/* nag_best_subset_given_size (h05abc) Example Program.
*
* Copyright 2017 Numerical Algorithms Group.
*
* Mark 26.1, 2017.
*/
/* Pre-processor includes */
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <nag.h>
#include <nag_stdlib.h>
#include <nagg02.h>
#include <nagh05.h>
#define BZ(I,J) bz[(J)*(m - ip) + (I)]
typedef struct
{
Integer n, tdq, tdx;
double tol;
double *x, *y, *b, *cov, *h, *p, *q, *res, *se, *wt, *com_ar;
Integer *sx, cnt;
Nag_IncludeMean mean;
} calc_subset_data;
void NAG_CALL read_subset_data(Integer m, calc_subset_data * cs);
void NAG_CALL tidy_subset_data(calc_subset_data * cs);
void NAG_CALL calc_subset_score(Integer m, Integer drop, Integer lz,
const Integer z[], Integer la,
const Integer a[], double score[],
Nag_Comm *comm, Integer *info);
int main(void)
{
int exit_status = 0;
/* Integer scalar and array declarations */
Integer i, ip, j, la, m, mincnt, mincr, mip, nbest;
Integer *a = 0, *bz = 0, *ibz = 0, *id = 0;
/* NAG structures */
NagError fail;
Nag_Comm comm;
/* Other data types */
calc_subset_data cs;
/* Double scalar and array declarations */
double gamma;
double *bscore = 0;
double acc[2];
/* Initialize the error structure */
INIT_FAIL(fail);
printf("nag_best_subset_given_size (h05abc) Example Program Results\n\n");
/* Skip heading in data file */
scanf("%*[^\n] ");
scanf("%*[^\n] ");
/* Read in the problem size */
scanf("%" NAG_IFMT "%" NAG_IFMT "%" NAG_IFMT "%*[^\n] ", &m, &ip, &nbest);
/* Read in the control parameters for the subset selection */
scanf("%" NAG_IFMT "%" NAG_IFMT "%lf%lf%lf%*[^\n] ", &mincr, &mincnt,
&gamma, &acc[0], &acc[1]);
/* Read in the data required for the score function */
read_subset_data(m, &cs);
/* Associate the data structure with comm.p */
comm.p = (void *) &cs;
/* Allocate memory required by the subset selection routine */
mip = m - ip;
if (!(a = NAG_ALLOC(MAX(nbest, m), Integer)) ||
!(bz = NAG_ALLOC(mip * nbest, Integer)) ||
!(bscore = NAG_ALLOC(MAX(nbest, m), double)))
{
printf("Allocation failure\n");
exit_status = -1;
goto END;
}
/* Call the forward communication best subset routine
(nag_best_subset_given_size (h05abc)) */
nag_best_subset_given_size(mincr, m, ip, nbest, &la, bscore, bz,
calc_subset_score, mincnt, gamma, acc, &comm,
&fail);
if (fail.code != NE_NOERROR) {
printf("Error from nag_best_subset_given_size (h05abc).\n%s\n",
fail.message);
if (fail.code != NE_ARG_TOO_LARGE) {
exit_status = 1;
goto END;
}
}
/* Titles */
printf(" Score Feature Subset\n");
printf(" ----- --------------\n");
/* Display the best subsets and corresponding scores.
nag_best_subset_given_size returns a list
of features excluded from the best subsets, so this is inverted to give
the set of features included in each subset */
if (!(id = NAG_ALLOC(m, Integer)))
{
printf("Allocation failure\n");
exit_status = -1;
goto END;
}
for (i = 0; i < la; i++) {
printf("%12.5e ", bscore[i]);
for (j = 0; j < m; j++)
id[j] = 1;
for (j = 0; j < mip; j++)
id[BZ(j, i) - 1] = 0;
for (j = 0; j < m; j++)
if (id[j])
printf(" %5" NAG_IFMT, j + 1);
printf("\n");
}
printf("\n");
if (fail.code == NE_TOO_MANY) {
printf("%" NAG_IFMT " subsets of the requested size do not exist, "
"only %" NAG_IFMT " are displayed.\n", nbest, la);
}
printf("%" NAG_IFMT " subsets evaluated in total\n", cs.cnt);
END:
NAG_FREE(a);
NAG_FREE(bz);
NAG_FREE(ibz);
NAG_FREE(bscore);
NAG_FREE(id);
tidy_subset_data(&cs);
return exit_status;
}
#define CSX(I, J) cs->x[(I) * cs->tdx + J]
void NAG_CALL read_subset_data(Integer m, calc_subset_data * cs)
{
/* Read in the data, from stdout, and allocate any arrays that are required
by the scoring function calc_subset_score */
Integer i, j;
cs->sx = 0;
cs->x = cs->y = cs->b = cs->se = cs->cov = cs->res = 0;
cs->h = cs->q = cs->p = cs->com_ar = cs->wt = 0;
/* Skip heading in data file */
scanf("%*[^\n] ");
/* Read in the number of observations for the data used in the linear
regression */
scanf("%" NAG_IFMT "%*[^\n] ", &cs->n);
/* Read in the control parameters for the linear regression */
scanf("%lf%*[^\n] ", &cs->tol);
/* Read in the data */
cs->tdx = m;
if (!(cs->x = NAG_ALLOC(cs->tdx * cs->n, double)) ||
!(cs->y = NAG_ALLOC(cs->n, double)))
{
printf("Allocation failure\n");
tidy_subset_data(cs);
exit(-1);
}
/* Read in the data for the linear regression */
for (i = 0; i < cs->n; i++) {
for (j = 0; j < m; j++)
scanf("%lf", &CSX(i, j));
scanf("%lf", &cs->y[i]);
scanf("%*[^\n] ");
}
/* No intercept term and no weights */
cs->mean = Nag_MeanZero;
/* Allocate memory required by the regression routine */
cs->tdq = cs->n;
if (!(cs->sx = NAG_ALLOC(m, Integer)) ||
!(cs->b = NAG_ALLOC(m, double)) ||
!(cs->se = NAG_ALLOC(m, double)) ||
!(cs->cov = NAG_ALLOC(m * (m + 1) / 2, double)) ||
!(cs->res = NAG_ALLOC(cs->n, double)) ||
!(cs->h = NAG_ALLOC(cs->n, double)) ||
!(cs->q = NAG_ALLOC(cs->n * cs->tdq, double)) ||
!(cs->p = NAG_ALLOC(2 * m + m * m, double)) ||
!(cs->com_ar = NAG_ALLOC(5 * (m - 1) * m * m, double)))
{
printf("Allocation failure\n");
tidy_subset_data(cs);
exit(-1);
}
cs->cnt = 0;
}
void NAG_CALL tidy_subset_data(calc_subset_data * cs)
{
/* Tidy up the data structure used by calc_subset_score */
NAG_FREE(cs->sx);
NAG_FREE(cs->x);
NAG_FREE(cs->y);
NAG_FREE(cs->b);
NAG_FREE(cs->se);
NAG_FREE(cs->cov);
NAG_FREE(cs->res);
NAG_FREE(cs->h);
NAG_FREE(cs->q);
NAG_FREE(cs->p);
NAG_FREE(cs->com_ar);
NAG_FREE(cs->wt);
}
void NAG_CALL calc_subset_score(Integer m, Integer drop, Integer lz,
const Integer z[], Integer la,
const Integer a[], double score[],
Nag_Comm *comm, Integer *info)
{
/* Calculate the score associated with a particular set of feature subsets.
This particular example finds the set, of a given size, of explanatory
variables that best fit a response variable when a linear regression model
is used. Therefore the feature set is the set of all the explanatory
variables and the best set of features is defined as set of explanatory
variables that gives the smallest residual sums of squares. See the
documentation for g02dac for details on linear regression models.
*/
calc_subset_data *cs;
/* Integer scalar and array declarations */
Integer i, j, inv_drop, ip, irank;
/* NAG structures */
NagError fail;
/* Double scalar and array declarations */
double rss, df;
/* Other data types */
Nag_Boolean svd;
/* Initialize the error structure */
INIT_FAIL(fail);
/* Point cs to the data structure stored in comm */
cs = (calc_subset_data *) comm->p;
/* Keep track of the number of susbets evaluated */
cs->cnt += MAX(1, la);
/* Set up the initial feature set.
If drop = 0, this is the Null set (i.e. no features).
If drop = 1 then this is the full set (i.e. all features) */
for (i = 0; i < m; i++)
cs->sx[i] = drop;
/* Add (if drop = 0) or remove (if drop = 1) the all the features specified
in Z (note: z refers to its features with values 1 to M, therefore you
must subtract 1 when using z as an array reference) */
inv_drop = (drop == 0) ? 1 : 0;
for (i = 0; i < lz; i++)
cs->sx[z[i] - 1] = inv_drop;
/* Loop over all the elements in a, looping at least once */
for (i = 0; i < MAX(la, 1); i++) {
if (la > 0) {
if (i > 0) {
/* Reset the feature altered at the last iteration */
cs->sx[a[i - 1] - 1] = drop;
}
/* Add or drop the I'th feature in A */
cs->sx[a[i] - 1] = inv_drop;
}
/* Calculate number of parameters in the regression model */
for (ip = j = 0; j < m; j++)
ip += (cs->sx[j] == 1);
/* Fit the regression model (g02dac) */
nag_regsn_mult_linear(cs->mean, cs->n, cs->x, cs->tdx, m, cs->sx, ip,
cs->y, cs->wt, &rss, &df, cs->b, cs->se, cs->cov,
cs->res, cs->h, cs->q, cs->tdq, &svd, &irank, cs->p,
cs->tol, cs->com_ar, &fail);
/* Return the score (the residual sums of squares) */
score[i] = rss;
}
/* If g02dac fails, terminate early */
if (fail.code != NE_NOERROR)
*info = 1;
}