/* nag_best_subset_given_size_revcomm (h05aac) Example Program.
*
* Copyright 2017 Numerical Algorithms Group.
*
* Mark 26.1, 2017.
*/
/* Pre-processor includes */
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <nag.h>
#include <nag_stdlib.h>
#include <nagg02.h>
#include <nagh05.h>

#define BZ(I,J) bz[(J)*(m - ip) + (I)]

typedef struct
{
Integer n, tdq, tdx;
double tol;
double *x, *y, *b, *cov, *h, *p, *q, *res, *se, *wt, *com_ar;
Integer *sx;
Nag_IncludeMean mean;
} calc_subset_data;

void read_subset_data(Integer m, calc_subset_data * cs);
void tidy_subset_data(calc_subset_data * cs);
void calc_subset_score(Integer m, Integer drop, Integer lz, const Integer z[],
Integer la, const Integer a[], double bscore[],
calc_subset_data * cs);

int main(void)
{
int exit_status = 0;

/* Integer scalar and array declarations */
Integer cnt, drop, i, ip, irevcm, j, la, licomm, lrcomm, lz, m,
mincnt, mincr, mip, nbest;
Integer *a = 0, *bz = 0, *ibz = 0, *icomm = 0, *z = 0, *id = 0;

/* NAG structures */
NagError fail;

/* Double scalar and array declarations */
double gamma;
double *bscore = 0, *rcomm = 0;
double acc;

/* Other data types */
calc_subset_data cs;

/* Initialize the error structure */
INIT_FAIL(fail);

printf("nag_best_subset_given_size_revcomm (h05aac) Example Program Results");
printf("\n\n");
/* Skip heading in data file */
scanf("%*[^\n] %*[^\n] ");

/* Read in the problem size */
scanf("%" NAG_IFMT "%" NAG_IFMT "%" NAG_IFMT "%*[^\n] ", &m, &ip, &nbest);

/* Read in the control parameters for the subset selection */
scanf("%" NAG_IFMT "%" NAG_IFMT "%lf%lf%lf%*[^\n] ", &mincr, &mincnt,
&gamma, &acc, &acc);

/* Read in the data required for the score function */

/* Allocate memory required by the subset selection routine */
mip = m - ip;
if (!(z = NAG_ALLOC(mip, Integer)) ||
!(a = NAG_ALLOC(MAX(nbest, m), Integer)) ||
!(bz = NAG_ALLOC(mip * nbest, Integer)) ||
!(bscore = NAG_ALLOC(MAX(nbest, m), double)))
{
printf("Allocation failure\n");
exit_status = -1;
goto END;
}

/* Allocate dummy communication arrays, as we will query the required size */
licomm = 2;
lrcomm = 0;
if (!(icomm = NAG_ALLOC(licomm, Integer)) ||
!(rcomm = NAG_ALLOC(lrcomm, double)))
{
printf("Allocation failure\n");
exit_status = -2;
goto END;
}

/* Query the required length for the communication arrays */
irevcm = 0;
nag_best_subset_given_size_revcomm(&irevcm, mincr, m, ip, nbest, &drop,
&lz, z, &la, a, bscore, bz, mincnt,
gamma, acc, icomm, licomm, rcomm, lrcomm,
&fail);

/* Extract the required sizes from the communication array */
licomm = icomm;
lrcomm = icomm;

/* Reallocate communication arrays to the correct size */
NAG_FREE(icomm);
NAG_FREE(rcomm);
if (!(icomm = NAG_ALLOC(licomm, Integer)) ||
!(rcomm = NAG_ALLOC(lrcomm, double)))
{
printf("Allocation failure\n");
exit_status = -3;
goto END;
}

/* Initialize reverse communication control flag */
irevcm = 0;

/* Call the reverse communication best subset routine
(nag_best_subset_given_size_revcomm (h05aac)) in a loop.
The loop should terminate when irevcm = 0 on exit */
for (cnt = 0;;) {
nag_best_subset_given_size_revcomm(&irevcm, mincr, m, ip, nbest, &drop,
&lz, z, &la, a, bscore, bz, mincnt,
gamma, acc, icomm, licomm, rcomm,
lrcomm, &fail);
if (irevcm == 0) {
if (fail.code != NE_NOERROR) {
printf("Error from nag_best_subset_given_size_revcomm (h05aac).\n%s\n",
fail.message);
if (fail.code != NE_ARG_TOO_LARGE) {
exit_status = 1;
goto END;
}
}
/* Jump out of the loop */
break;
}

/* Calculate and return the score for the required models and keep track of
the number of subsets evaluated */
cnt += MAX(1, la);
calc_subset_score(m, drop, lz, z, la, a, bscore, &cs);
}

/* Titles */
printf("    Score        Feature Subset\n");
printf("    -----        --------------\n");

/* Display the best subsets and corresponding scores.
* nag_best_subset_given_size_revcomm returns a list of features excluded
* from the best subsets, so this is inverted to give the set of features
* included in each subset.
*/
if (!(id = NAG_ALLOC(m, Integer)))
{
printf("Allocation failure\n");
exit_status = -1;
goto END;
}
for (i = 0; i < la; i++) {
printf("%12.5e ", bscore[i]);
for (j = 0; j < m; j++)
id[j] = 1;
for (j = 0; j < mip; j++)
id[BZ(j, i) - 1] = 0;
for (j = 0; j < m; j++)
if (id[j])
printf(" %5" NAG_IFMT, j + 1);
printf("\n");
}
printf("\n");
if (fail.code == NE_TOO_MANY) {
printf("%" NAG_IFMT " subsets of the requested size do not exist,"
" only %" NAG_IFMT " are displayed.\n", nbest, la);
}
printf("%" NAG_IFMT " subsets evaluated in total\n", cnt);

END:
NAG_FREE(z);
NAG_FREE(a);
NAG_FREE(bz);
NAG_FREE(ibz);
NAG_FREE(bscore);
NAG_FREE(icomm);
NAG_FREE(rcomm);
NAG_FREE(id);

tidy_subset_data(&cs);

return exit_status;
}

#define CSX(I, J) cs->x[(I) * cs->tdx + J]
void read_subset_data(Integer m, calc_subset_data * cs)
{
/* Read in the data, from stdout, and allocate any arrays that are required
by the scoring function calc_subset_score */
Integer i, j;
cs->sx = 0;
cs->x = cs->y = cs->b = cs->se = cs->cov = cs->res = cs->h = cs->q = 0;
cs->p = cs->com_ar = cs->wt = 0;

/* Skip heading in data file */
scanf("%*[^\n] ");

/* Read in the number of observations for the data used in the linear
regression */
scanf("%" NAG_IFMT "%*[^\n] ", &cs->n);

/* Read in the control parameters for the linear regression */
scanf("%lf%*[^\n] ", &cs->tol);

/* Read in the data */
cs->tdx = m;

if (!(cs->x = NAG_ALLOC(cs->tdx * cs->n, double)) ||
!(cs->y = NAG_ALLOC(cs->n, double)))
{
printf("Allocation failure\n");
tidy_subset_data(cs);
exit(-1);
}

/* Read in the data for the linear regression */
for (i = 0; i < cs->n; i++) {
for (j = 0; j < m; j++)
scanf("%lf", &CSX(i, j));
scanf("%lf%*[^\n] ", &cs->y[i]);
}

/* No intercept term and no weights */
cs->mean = Nag_MeanZero;

/* Allocate memory required by the regression routine */
cs->tdq = cs->n;
if (!(cs->sx = NAG_ALLOC(m, Integer)) ||
!(cs->b = NAG_ALLOC(m, double)) ||
!(cs->se = NAG_ALLOC(m, double)) ||
!(cs->cov = NAG_ALLOC(m * (m + 1) / 2, double)) ||
!(cs->res = NAG_ALLOC(cs->n, double)) ||
!(cs->h = NAG_ALLOC(cs->n, double)) ||
!(cs->q = NAG_ALLOC(cs->n * cs->tdq, double)) ||
!(cs->p = NAG_ALLOC(2 * m + m * m, double)) ||
!(cs->com_ar = NAG_ALLOC(5 * (m - 1) * m * m, double)))
{
printf("Allocation failure\n");
tidy_subset_data(cs);
exit(-1);
}
}

void tidy_subset_data(calc_subset_data * cs)
{
/* Tidy up the data structure used by calc_subset_score */
NAG_FREE(cs->sx);
NAG_FREE(cs->x);
NAG_FREE(cs->y);
NAG_FREE(cs->b);
NAG_FREE(cs->se);
NAG_FREE(cs->cov);
NAG_FREE(cs->res);
NAG_FREE(cs->h);
NAG_FREE(cs->q);
NAG_FREE(cs->p);
NAG_FREE(cs->com_ar);
NAG_FREE(cs->wt);
}

void calc_subset_score(Integer m, Integer drop, Integer lz, const Integer z[],
Integer la, const Integer a[], double bscore[],
calc_subset_data * cs)
{
/* Calculate the score associated with a particular set of feature subsets.
m,drop,lz,z,la,a and bscore are all as described in the documentation
of nag_best_subset_given_size_revcomm (h05aac).
cs - Structure of type calc_subset_data that holds any additional data
required by this routine

This particular example finds the set, of a given size, of explanatory
variables that best fit a response variable when a linear regression model
is used. Therefore the feature set is the set of all the explanatory
variables and the best set of features is defined as set of explanatory
variables that gives the smallest residual sums of squares. See the
documentation for g02dac for details on linear regression models.
*/

/* Integer scalar and array declarations */
Integer i, j, inv_drop, ip, irank;

/* NAG structures */
NagError fail;

/* Double scalar and array declarations */

/* Other data types */
Nag_Boolean svd;

/* Initialize the error structure */
INIT_FAIL(fail);

/* Set up the initial feature set.
If drop = 0, this is the Null set (i.e. no features).
If drop = 1 then this is the full set (i.e. all features) */
for (i = 0; i < m; i++)
cs->sx[i] = drop;

/* Add (if drop = 0) or remove (if drop = 1) the all the features specified
in Z (note: z refers to its features with values 1 to M, therefore you
must subtract 1 when using z as an array reference) .
*/
inv_drop = (drop == 0) ? 1 : 0;
for (i = 0; i < lz; i++)
cs->sx[z[i] - 1] = inv_drop;

/* Loop over all the elements in a, looping at least once */
for (i = 0; i < MAX(la, 1); i++) {
if (la > 0) {
if (i > 0) {
/* Reset the feature altered at the last iteration */
cs->sx[a[i - 1] - 1] = drop;
}

/* Add or drop the I'th feature in A */
cs->sx[a[i] - 1] = inv_drop;
}

/* Calculate number of parameters in the regression model */
for (ip = j = 0; j < m; j++)
ip += (cs->sx[j] == 1);

/* Fit the regression model (g02dac) */
nag_regsn_mult_linear(cs->mean, cs->n, cs->x, cs->tdx, m, cs->sx,
ip, cs->y, cs->wt, &rss, &df, cs->b, cs->se,
cs->cov, cs->res, cs->h, cs->q, cs->tdq, &svd,
&irank, cs->p, cs->tol, cs->com_ar, &fail);

/* Return the score (the residual sums of squares) */