#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <math.h>

/* 
   Program "hwmetc.c"          (17/06/97)
   Updated for ANSI C; 18/07/00   - to compile type  cc hwmetc.c -lm

   Karen L. Ayres               
   University of Reading 
   e-mail: k.l.ayres@reading.ac.uk 
   
   Metropolis-Hastings algorithm for obtaining a sample from the
   posterior distribution of the allele frequencies and the inbreeding
   coefficient f, for the Hardy-Weinberg inbreeding model
   (e.g. Malecot, 1969):

   p_ii=p_i(f+(1-f)p_i)
   p_ij=2p_ip_j(1-f)

   The method is detailed in Ayres & Balding, Heredity 1998

   Combines information from several loci, leading to sharper estimation.
   This assumes gametic equilibrium, no genotypic association etc.

   Allows for negative values of f, provided that the condition
   p_ii>=0 is satisfied for each locus - for only non-negative values of f
   the program "hwmetc2.c" should be used for combining over loci.

   Reads from input file "hwmet.inp", which is a data file
   whose first line contains the integer 1, to represent that the
   resulting data is that for the first locus, and whose second line
   CONTAINS ONLY AN INTEGER WHICH IS THE NUMBER OF ALLELES IN THE
   RESULTING DATA SET - this is stored in "nall[loc]".
   The following lines in the input file should consist of a "nall[loc]"
   by "nall[loc]" matrix of observed genotype counts - counts for
   homozygous genotypes are on the diagonal, heterozygous genotypes
   are off the diagonal. The next line should then contain a 2, followed
   by the same format as for locus 1 etc. e.g.
   1 2
   12 34
   0 7
   2 2
   21 17
   0 19

   Outputs "f" parameter values to file "hwmet.out", and information
   about the implementation to "hwmet.stats".

   The observed genotype counts are stored in "n".

   The p[loc][i] each have a multivariate uniform prior (independent for each 
   locus) and f has a Beta(a,b) prior conditional on the vectors p (since 
   the permissable range of f is dependent on the p's) on the interval 
   (A,1), where A=-pmin/(1-pmin), and pmin is the smallest value in the 
   combined p vectors (assuming all p[i] are non-zero), with parameters
   a = 1-A.(b-1)
   b = g_t(pmin)
   and where g_t(*) is a polynomial function of p_min, whose form is
   determined by the value "t", which represents where the right-hand
   tail of the prior density curve is to become approximately 0. This
   program currently caters for three values of "t" - 0.1, 0.2, and 0.4,
   which are likely to be useful for many human populations since they
   give most support to values of f close to zero. The conditional priors
   all have mode at zero, which may be reasonable a priori. (If these
   priors are not reasonable, it may be preferable to apply the program
   "hwmetc2.c" which restricts f to non-negative values and implements
   a standard Beta prior.)

   The values of "t" in the program are coded as follows:

   for t=0.1  : set btail = 1
   for t=0.2  : set btail = 2
   for t=0.4  : set btail = 4
   for a uniform (i.e. non-informative) prior  : set btail = 0
   
   The parameter "btail" is defined at the beginning of the program,
   and should be set to one of the above integers accordingly.

   A Metropolis-Hastings algorithm is run for "numit" iterations,
   the first "discard" of which are discarded as the burn-in.
   Subsequently, every "kp"th iteration is output.
   Note that "numit" is determined by the values of "kp", "discard"
   and "nout", where "nout" is the number of values desired to be
   output.

   The limiting bound "epsln" (>0) can be changed as necessary, in order 
   to obtain reasonable acceptance rates.

   Density curves can be produced within a suitable statistical computer
   package by applying density estimation to the output from this
   program.
*/

#define btail 0  /* determines right-hand tail of conditional beta prior of F */
#define epsln 0.02  /* limit on amount to update p's by (< 0.5 and > 0) */

#define maxloc 6  /* maximum number of loci allowed */
#define maxall 15  /* maximum number of alleles allowed at a locus */
#define kp 50     /* interval between successive output values */
#define discard 5000L    /* "burn-in" value of the chain */
#define nout 10000L     /* number of output values required */
#define numit (nout*kp+discard) /* number of iterations of alg. */
#define acrt (numit/50)  /* interval between output of acceptance rates */
#define dim1 ((maxall*(maxall+1))/2)

typedef double pvec[maxloc][maxall];

int     i, j, k, nn, loc, nsam[maxloc], nall[maxloc], ngen[maxloc], nloc, nallmax;
int     x[maxloc][maxall][maxall], n[maxloc][dim1], p1[maxloc], p2[maxloc];
long    l1, jmp, ix, iy, iz;
double  y, a1[maxloc], a2[maxloc], b1[maxloc], b2[maxloc], b3, b4, a3, a4;
double  r, pmin, newpmin, f, newf, epsln2, genf[dim1], z, newz, gsum;
pvec    p, newp;
FILE    *inp, *out1, *out2, *out3;

double WichHill()
{
/* Wichmann & Hill random number generator (Algorithm AS183 in Applied
   Statistics 31 (1982))
*/

ix=(171*ix)%30269;
iy=(172*iy)%30307;
iz=(170*iz)%30323;
return fmod((ix/30269.0)+(iy/30307.0)+(iz/30323.0),1.0);
}

void inpdat()
{
out1=fopen("hwmet.stats","w");
fprintf(out1,"Number of iterations %8ld\n",numit);
inp=fopen("hwmet.inp","r");
while ((nn=fscanf(inp,"%d",&nloc))!=EOF){
	fscanf(inp,"%d",&nall[nloc-1]);

/* Check for invalid input file */

	if (nloc>maxloc){
		fprintf(out1,"ERROR - TOO MANY LOCI - CHANGE MAXLOC\n");
		exit(1);
	}
	if (nall[nloc-1]>maxall){
		fprintf(out1,"ERROR - TOO MANY ALLELES - CHANGE MAXALL\n");
		exit(1);
	}

	ngen[nloc-1]=(nall[nloc-1]*(nall[nloc-1]+1))/2;
	for (i=0;i<nall[nloc-1];++i){
		for (j=0,k=0;j<nall[nloc-1];++j){
			fscanf(inp,"%d",&x[nloc-1][i][j]);
			nsam[nloc-1]+= x[nloc-1][i][j];
			k+= x[nloc-1][i][j]+x[nloc-1][j][i];
		}
		if (k==0){
			fprintf(out1,"WARNING - unobserved allele %d at locus %d enforces f >=0\n",i+1,nloc);
		}
	}
}

/* observed genotype counts, using genotype identifier for ij: i+(j*(j+1))/2
   where i<=j (modified for i=0,...,k-1 from that given for i=1,...,k in
   Zaykin, Zhivotovsky & Weir (1995) - Genetica 96)
*/

for (loc=0;loc<nloc;++loc){
	for (i=0;i<nall[loc];++i){
		for (j=i;j<nall[loc];++j){
			n[loc][i+(j*(j+1))/2]=x[loc][i][j];
			if (i!=j){
				n[loc][i+(j*(j+1))/2]+= x[loc][j][i];
			}
		}
	}
}

/* set value of epsln2, the limiting bound for the updating distribution for f
   -- setting this > [(nallmax^2).epsln]/[(nallmax-1).(nallmax-1-nallmax.epsln)]
   ensures valid convergence of the Markov chain
*/

if (epsln>=0.5){
	fprintf(out1,"ERROR - LIMITING BOUND FOR P INVALID - CHANGE EPSLN (<0.5)\n");
	exit(1);
}
nallmax=2;
for (loc=0;loc<nloc;++loc){
	if (nall[loc]>nallmax){
		nallmax=nall[loc];
	}
}
epsln2=epsln*nallmax*nallmax/((nallmax-1.0)*(nallmax-1.0-nallmax*epsln))+10e-10;

/* output statistics to file */

fprintf(out1,"Limiting bound for updating of all p_i ... %6.3f\n",epsln);
fprintf(out1,"Limiting bound for updating of f   ... %6.4f\n\n",epsln2);
fprintf(out1,"Multivariate uniform prior for each allele frequency vector\n");
if (btail==0){
	fprintf(out1,"Conditional prior for f - uniform\n");
}
else if (btail==1){
	fprintf(out1,"Conditional prior for f - beta with right-hand tail out to 0.1 approx.\n");
}
else if (btail==2){
	fprintf(out1,"Conditional prior for f - beta with right-hand tail out to 0.2 approx.\n");
}
else if (btail==4){
	fprintf(out1,"Conditional prior for f - beta with right-hand tail out to 0.4 approx.\n");
}
else{
	fprintf(out1,"ERROR - INVALID TAIL PARAMETER FOR BETA PRIOR - CHANGE BTAIL (0,1,2,4)\n");
	exit(1);
}
fprintf(out1,"\nSample sizes:\n");
for (loc=0;loc<nloc;++loc){
	fprintf(out1,"Locus %3d, nsam= %6d\n",loc+1,nsam[loc]);
}
fprintf(out1,"Acceptance rates\n");

fclose(inp);
fflush(out1);
}

double loggf(double xx)
{
/* Adapted from the underlying algorithm in Numerical Recipes in C (1988)
   [by Press et al., published by Camridge University Press], for 
   calculating the log of the gamma function
*/

double lg=(xx+0.5)*log(xx+5.5)-xx-5.5, const1=1.000000000190015;

const1+= 76.1800917297146/(xx+1.0);
const1+= 24.01409824083091/(xx+3.0);
const1+= 0.001208650973866179/(xx+5.0);
const1-= 86.50532032941677/(xx+2.0);
const1-= 1.2317395572450155/(xx+4.0);
const1-= 0.000005395239384953/(xx+6.0);

lg+= log(2.5066282746310005*const1/xx);

return lg;
}

double lbetad(double fs, double pms)
{
/* evaluate log of the conditional prior density for "fs" */

double blb, bpa, bpb;

blb= -pms/(1.0-pms);
if (btail==0){
	bpb=1.0;
}
else if (btail==1){
	bpb=61.37+1482.08*pms-1643.18*pow(pms,2.0)+41521.78*pow(pms,3.0)-298223.0*pow(pms,4.0)+723583.9*pow(pms,5.0)-579512.0*pow(pms,6.0);
}
else if (btail==2){
	bpb=23.61+241.3*pms-1048.65*pow(pms,2.0)+8944.7*pow(pms,3.0)-20820.2*pow(pms,4.0)+14112.95*pow(pms,5.0);
}
else if (btail==4){
	bpb=11.15+29.75*pms+773.84*pow(pms,2.0)-4409.76*pow(pms,3.0)+9156.72*pow(pms,4.0)-6515.75*pow(pms,5.0);
}
bpa= 1.0-blb*(bpb-1.0);

return (-log(1.0-blb)+loggf(bpa+bpb)-loggf(bpa)-loggf(bpb)+(bpa-1.0)*log((fs-blb)/(1.0-blb))+(bpb-1.0)*log((1.0-fs)/(1.0-blb)));
}

double rstgam(double alpha1)
{
/* Cheng & Feast algorithm (algorithm 3.20 in Ripley, 1988) for generating
   a standard Gamma random variable with shape parameter alpha>1
*/

int     acpt;
double  c1, c2, c3, c4, c5, u1, u2, w;

c1=alpha1-1.0;
c2=(alpha1-(1.0/(6.0*alpha1)))/c1;
c3=2.0/c1;
c4=c3+2.0;
c5=1.0/sqrt(alpha1);
do{
	do{
		u1=WichHill();
		u2=WichHill();
		if (alpha1>2.5){
			u1=u2+c5*(1.0-(1.86*u1));
		}
	}
	while (u1>=1.0 || u1<=0);

	w=(c2*u2)/u1;
	if ((c3*u1+w+(1.0/w))<=c4){
		acpt=1;
	}
	else if ((c3*log(u1)-log(w)+w)>=1.0){ /* repeat outer "do" loop */
		acpt=0;
	}
	else{
		acpt=1;
	}
	}
while (acpt!=1);

return c1*w;
}

double rbetaf()
{
/* returns a Beta random variable (from the prior distribution) to
   initialise f, with parameters determined by the minimum of the allele
   frequencies "pmin", on the interval [-pmin/(1-pmin),1]
*/

double rgm1, rgm2, bpa, bpb, blb;

blb= -pmin/(1.0-pmin);

if (btail==0){
	bpb=1.0;
}
else if (btail==1){
	bpb=61.37+1482.08*pmin-1643.18*pow(pmin,2.0)+41521.78*pow(pmin,3.0)-298223.0*pow(pmin,4.0)+723583.9*pow(pmin,5.0)-579512.0*pow(pmin,6.0);
}
else if (btail==2){
	bpb=23.61+241.3*pmin-1048.65*pow(pmin,2.0)+8944.7*pow(pmin,3.0)-20820.2*pow(pmin,4.0)+14112.95*pow(pmin,5.0);
}
else if (btail==4){
	bpb=11.15+29.75*pmin+773.84*pow(pmin,2.0)-4409.76*pow(pmin,3.0)+9156.72*pow(pmin,4.0)-6515.75*pow(pmin,5.0);
}
bpa= 1.0-blb*(bpb-1.0);

if (bpa>1.0){
	rgm1=rstgam(bpa);
}
else if (bpa<1.0){    /* Ex. 3.22 in Ripley(1988) for alpha<1 */
	rgm1=rstgam(bpa+1.0)*pow(WichHill(),1.0/bpa);
}
else {    /* Exponential */
	rgm1=(-log(WichHill()));
}
if (bpb>1.0){
	rgm2=rstgam(bpb);
}
else if (bpb<1.0){    /* Ex. 3.22 in Ripley(1988) for alpha<1 */
	rgm2=rstgam(bpb+1.0)*pow(WichHill(),1.0/bpb);
}
else {    /* Exponential */
	rgm2=(-log(WichHill()));
}

return (rgm1/(rgm1+rgm2))*(1.0-blb)+blb;
}

double llkhd(int l,double fs,pvec ps)
{
/* evaluates the log of the joint likelihood of "fs" and "ps" for
   locus "l" (exlcuding multinomial constant)
*/

double  lld=0.0;
for (i=0;i<nall[l];++i){
	for (j=i+1;j<nall[l];++j){
		genf[i+(j*(j+1))/2]=2.0*(1-fs)*ps[l][i]*ps[l][j];
	}
}
for (i=0;i<nall[l];++i){
	genf[(i*(i+3))/2]=ps[l][i]*(fs+(1-fs)*ps[l][i]);
}

/* log likelihood */

for (i=0;i<ngen[l];++i){
	if (n[l][i]!=0){
		lld+= n[l][i]*log(genf[i]);
	}
}
return lld;
}

main()
{
srand((unsigned) time(NULL));
out2=fopen("hwmet.out","w");

/* initialise seeds for random number generator */

ix=(long)(rand()%30000)+1;
iy=(long)(rand()%30000)+1;
iz=(long)(rand()%30000)+1;

inpdat();

/* initialise "p" and "f" values */

for (loc=0;loc<nloc;++loc){
	gsum=0;
	for (i=0;i<nall[loc];++i){
		p[loc][i]= (-log(WichHill()));
		gsum+= p[loc][i];
	}
	for (i=0;i<nall[loc];++i){
		p[loc][i]=p[loc][i]/gsum;
	}
}

pmin=0.5;
for (loc=0;loc<nloc;++loc){
	for (i=0;i<nall[loc];++i){
		if (p[loc][i]<pmin){
			pmin=p[loc][i];
		}
	}
}

f=rbetaf();

/* initialise density value "z" */

z=lbetad(f,pmin);
for (loc=0;loc<nloc;++loc){
	z+= llkhd(loc,f,p);
}

/* begin Metropolis-Hastings algorithm */

for (l1=1;l1<=numit;++l1){

/* choose new candidate values "newp" and "newf" */

	for (loc=0;loc<nloc;++loc){
		p1[loc]=(int)(WichHill()*nall[loc]);
		do {
			p2[loc]=(int)(WichHill()*nall[loc]);
		}
		while (p2[loc]==p1[loc]);

		b1[loc]=(p[loc][p1[loc]]+epsln<p[loc][p1[loc]]+p[loc][p2[loc]])?p[loc][p1[loc]]+epsln:p[loc][p1[loc]]+p[loc][p2[loc]];
		if (b1[loc]>1.0){  /* can only occur through rounding error */
			b1[loc]=1.0;
		}
		a1[loc]=(p[loc][p1[loc]]-epsln>0)?p[loc][p1[loc]]-epsln:0;
		for (i=0;i<nall[loc];++i){
			newp[loc][i]=p[loc][i];
		}
		newp[loc][p1[loc]]= (b1[loc]-a1[loc])*WichHill()+a1[loc];
		newp[loc][p2[loc]]-= newp[loc][p1[loc]]-p[loc][p1[loc]];
	}
	newpmin=0.5;
	for (loc=0;loc<nloc;++loc){
		for (i=0;i<nall[loc];++i){
			if (newp[loc][i]<newpmin){
				newpmin=newp[loc][i];
			}
		}
	}

	b3=(1.0<f+epsln2)?1.0:f+epsln2;
	a3=(-newpmin/(1.0-newpmin)>f-epsln2)?-newpmin/(1.0-newpmin):f-epsln2;
	newf=a3+WichHill()*(b3-a3);

/* evaluate density "newz" at candidate values */

	newz=lbetad(newf,newpmin);
	for (loc=0;loc<nloc;++loc){
		newz+= llkhd(loc,newf,newp);
	}

/* decide whether or not to accept candidate values */

	for (loc=0;loc<nloc;++loc){
		b2[loc]=(newp[loc][p1[loc]]+epsln<newp[loc][p1[loc]]+newp[loc][p2[loc]])?newp[loc][p1[loc]]+epsln:newp[loc][p1[loc]]+newp[loc][p2[loc]];
		if (b2[loc]>1.0){  /* can only occur through rounding error */
			b2[loc]=1.0;
		}
		a2[loc]=(newp[loc][p1[loc]]-epsln>0)?newp[loc][p1[loc]]-epsln:0;
	}

	b4=(1.0<newf+epsln2)?1.0:newf+epsln2;
	a4=(-pmin/(1.0-pmin)>newf-epsln2)?-pmin/(1.0-pmin):newf-epsln2;

	r=newz-z+log(b3-a3)-log(b4-a4);
	for (loc=0;loc<nloc;++loc){
		r+= log(b1[loc]-a1[loc])-log(b2[loc]-a2[loc]);
	}

	if (log(WichHill())<r){ /* accept candidate values */
		pmin=newpmin;
		z=newz;
		for (loc=0;loc<nloc;++loc){
			for (i=0;i<nall[loc];++i){
				p[loc][i]=newp[loc][i];
			}
		}
		f=newf;
		jmp++;
	}

/* output */

	if (((l1/kp)*kp==l1) && (l1>discard)){
		fprintf(out2,"%10.6f\n",f);
		fflush(out2);
	}

	if ((l1/acrt)*acrt==l1){     /* output acceptance rate */
		fprintf(out1,"%12.6f\n",((double)(jmp))/l1);
		fflush(out1);
	}
}

fclose(out1);
fclose(out2);
return 0;
}
