#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <time.h>

/*
	Program "ldmet.c"               (02/03/98)

      Karen L. Ayres               
      University of Reading 
      e-mail: k.l.ayres@reading.ac.uk 

	OVERVIEW ...

	Metropolis-Hastings algorithm for obtaining a sample from the
	posterior probability distribution of TWO-LOCUS haplotype frequencies
	f_ij given genotype data (where phase cannot be determined).
	Given estimates of f_ij, marginal posterior distributions of
	disequilibrium coefficients D_ij based upon them can be
	obtained. For 3 loci, use ldmet3.c.

	This algorithm outputs a sample from the distribution of f_ij,
	D_ij, D'_ij, D' among other things (see section below on output).
	Density curves can then be produced within a suitable statistical 
	computer package by applying density estimation to the output from this
	program.

	METHOD ...

	(The method used is fully detailed in Ayres & Balding (2000), Genetics 157)

	A Markov chain Monte Carlo (MCMC) algorithm is implemented in
	order to sample from the posterior of haplotype frequencies f_ij. 
	The Bayesian approach requires a prior for f_ij as well as the likelihood.

	The joint likelihood is assumed to be multinomial, hence = product 
	of the genotype frequencies p_ii'jj' to the power of the observed 
	frequencies n_ii'jj', where the expected genotype frequencies are given 
	by the random union of gametes model:

	p_iijj  =(f_ij)^2
	p_ii'jj =2(f_ij)(f_i'j)
	p_iijj' =2(f_ij)(f_ij')
	p_ii'jj'=2(f_ij)(f_i'j')+2(f_i'j)(f_ij')

	The prior for f_ij implemented here is Dirichlet with parameters
	K*x*w, where x and w are vectors of hyperparameters encompassing
	some prior information about allele frequencies at locus 1 and 2
	respectively. In the paper we set K=I*J where I is number of alleles
	at first locus, J number of alleles at second. We implement
	independent multivariate uniform (Diric[1,..,1]) for hyperparameter
	vectors x and w (note that x and w are parameters to be estimated
	in the algorithm). See the paper for more details on this hierarchical
	prior.

   	The Metropolis-Hastings algorithm is run for "numit" iterations,
	the first "discard" of which are discarded as the burn-in. 
      Subsequently, every "kp"th iteration is output.
   	Note that "numit" is determined by the values of "kp", "discard"
   	and "nout", where "nout" is the number of values desired to be
   	output. For details on MCMC, such as determining suitable values
	for the burn-in and thinning interval (kp) by assessing convergence
	see e.g. "Markov chain Monte Carlo" by D. Gamerman.

	The f_ij are updated by choosing two fs to update, f_ab and f_cd
	say, then choosing new f'_ab uniformly in an interval centred on
	the current f_ab, with width of interval determined by "epsln" (>0).
	To ensure f_ij sum to 1, the new f'_cd is determined by the other
	f'_ij (=f_ij) and f'_ab. The x and w are updated similarly (using
	"epsln2")

  	The limiting bounds "epsln" amd "epsln2" can be changed as necessary, 
	in order to obtain reasonable acceptance rates.

	PARAMETERS TO BE SPECIFIED ...

	Users should specify the following (change the values given
	in the #define declarations below, at the beginning of the
	program code) ...

	maxall   - maximum number of alleles allowed at each locus 
	kp       - thinning interval for MCMC algorithm
	discard  - number of MCMC iterations to initially discard as burn-in
	nout     - number of iterations to output (number of values sampled from posterior)
	epsln    - limit for updating distribution of f_ij
	epsln2   - limit for updating dist of hyperparameter values x, w
	acrt     - specify c for (numit/c), to determine how often to output
		     acceptance rates to stats file
	
	INPUT/OUTPUT FORMAT ...

	Reads from input file "ldmet.inp", which is a data file
	where each line contains 5 integers, the first two representing
	the alleles for the first locus (i and i') the second two
	representing the alleles at the second locus (j and j') and the
	final being the observed genotype count for that genotype (ii'jj')
	in the data, e.g.

	0 0 0 0 12
      0 0 0 1 22
	0 1 0 1 8

	The program outputs monitoring values to "ldmet.stats" such
	as acceptance rates. Samples from the posterior are output 
      to ldmet?.out in the order of haplotypes e.g. (for 3 alleles 
	at each locus) f_00 f_01 f_02 f_10 f_11 f_12 f_20 f_21 f_22 
	Files are: ldmetf.out for f_ij, ldmetd.out for d_ij, ldmetdpi.out
	for d'_ij, ldmetdp.out for D' (Hedrick's summary measure),
	ldemtxw.out for x_i and w_j (hyperparameters). Other disequilbrium	
	measures can be calculated directly from the output f_ij if
	required.

	IMPORTANT NOTE ... If there are I alleles at one locus and J alleles
	at another, the program outputs "nout" x I x J values to the
	file "ldmet1.out". If I and J are large, this file will also
	be very large and may require too much storage space. For I
	and J greater than about 5, it is advisable to reduce "nout".
*/

/* define values to choose in algorithm */
#define maxall 10  /* maximum number of alleles allowed at each locus */
#define kp 200     /* interval between successive output values */
#define discard 30000L    /* number of values to initially discard */
#define nout 5000L     /* number of output values required */
#define epsln 0.0091  /* limit for updating distribution of parameter values */
#define epsln2 0.009  /* limit for updating dist of hyperparameter values */
#define acrt (numit/20)  /* interval between output of acceptance rates */

#define numit (nout*kp+discard) /* number of iterations of alg. */

typedef double pvec[maxall];
typedef double parvec[maxall][maxall];

int     i, idash, j, jdash, k, nn, nsam, nall1, nall2, inc1, inc2, dec1, dec2;
int     n[maxall][maxall][maxall][maxall];
long    l1, jmp, ix, iy, iz;
double  r, z=0, newz, ddash, cl1, cu1, cl2, cu2, fsum, a3, a4, b3, b4;
double  a1, a2, b1, b2, pconst;
parvec  f, newf, d, dmax, dmin, pprior, ddash2, pprior2;
pvec    p, q, p0, q0, xprior, wprior, w, neww, x, newx;
FILE    *inp, *out1, *out2, *out3, *out4, *out5, *out6;

double WichHill()
{
/* Wichmann & Hill random number generator (Algorithm AS183 in Applied
   Statistics 31 (1982))
*/

ix=(171*ix)%30269;
iy=(172*iy)%30307;
iz=(170*iz)%30323;
return fmod((ix/30269.0)+(iy/30307.0)+(iz/30323.0),1.0);
}

void inpdat()
{
/* read in data from ldmet.inp */

int tmp;

/* open log file */

out1=fopen("ldmet.stats","w");
fprintf(out1,"Number of iterations %8ld\n",numit);

while ((nn=fscanf(inp,"%d%d%d%d",&i,&idash,&j,&jdash))!=EOF){
	fscanf(inp,"%d",&k);
	if (i>idash){
	tmp=idash;
	idash=i;
	i=tmp;
	}
	if (j>jdash){
	tmp=jdash;
	jdash=j;
	j=tmp;
	}

      /* Check for invalid input file */

	if ((idash+1>maxall)||(jdash+1>maxall)){
		fprintf(out1,"ERROR - TOO MANY ALLELES - CHANGE MAXALL\n");
		exit(1);
	}

	if (idash+1>nall1){
		nall1=idash+1;
	}
	if (jdash+1>nall2){
		nall2=jdash+1;
	}
	n[i][idash][j][jdash]+= k;
	nsam+= k;
	p0[i]+= k;
	p0[idash]+= k;
	q0[j]+= k;
	q0[jdash]+= k;
}

for (i=0;i<nall1;++i){
	if (p0[i]==0){
		fprintf(out1,"ERROR - ALLELE %d AT LOCUS 1 IS NULL - RELABEL\n",i+1);
	}
	p0[i]=p0[i]/(2.0*nsam);
}
for (j=0;j<nall2;++j){
	if (q0[j]==0){
		fprintf(out1,"ERROR - ALLELE %d AT LOCUS 2 IS NULL - RELABEL\n",j+1);
	}
	q0[j]=q0[j]/(2.0*nsam);
}

fprintf(out1,"\nAcceptance rates\n");

fclose(inp);
fflush(out1);
}

double loggf(double xx)
{
/* Adapted from the underlying algorithm in Numerical Recipes in C (1988) 
   [by Press et al., published by Cambridge University Press], for 
   calculating the log of the gamma function
*/

double lg=(xx+0.5)*log(xx+5.5)-xx-5.5, const1=1.000000000190015;

const1+= 76.1800917297146/(xx+1.0);
const1+= 24.01409824083091/(xx+3.0);
const1+= 0.001208650973866179/(xx+5.0);
const1-= 86.50532032941677/(xx+2.0);
const1-= 1.2317395572450155/(xx+4.0);
const1-= 0.000005395239384953/(xx+6.0);

lg+= log(2.5066282746310005*const1/xx);

return lg;
}

double rstgam(double alpha1)
{
/* Cheng & Feast algorithm (algorithm 3.20 in Ripley 1988) for
   generating a standard Gamma random variable with shape
   parameter alpha>1 */

int     acpt;
double  c1, c2, c3, c4, c5, u1, u2, w;

c1=alpha1-1.0;
c2=(alpha1-(1.0/(6.0*alpha1)))/c1;
c3=2.0/c1;
c4=c3+2.0;
c5=1.0/sqrt(alpha1);
do{
	do{
		u1=WichHill();
		u2=WichHill();
		if (alpha1>2.5){
			u1=u2+c5*(1.0-(1.86*u1));
		}
	}
	while (u1>=1.0 || u1<=0);

	w=(c2*u2)/u1;
	if ((c3*u1+w+(1.0/w))<=c4){
		acpt=1;
	}
	else if ((c3*log(u1)-log(w)+w)>=1.0){ /* repeat outer "do" loop */
		acpt=0;
	}
	else{
		acpt=1;
	}
}
while (acpt!=1);

return c1*w;
}

void rdiric(parvec pars)
{
/* Generates vector of Dirichlet random variables for nall1 x nall2 matrix (f_ij)
   with parameters in "pars", and stores in "pars" */

int     np1, np2;
double  gsum=0.0;

for (np1=0;np1<nall1;++np1){
	for (np2=0;np2<nall2;++np2){
		if (pars[np1][np2]>1.0){
			pars[np1][np2]=rstgam(pars[np1][np2]);
		}
		else if (pars[np1][np2]<1.0){    /* Ex. 3.22 in Ripley(1988) for alpha<1 */
			pars[np1][np2]=rstgam(pars[np1][np2]+1.0)*pow(WichHill(),1.0/pars[np1][np2]);
		}
		else {    /* Exponential */
			pars[np1][np2]= (-log(WichHill()));
		}
		gsum+= pars[np1][np2];
	}
}
for (np1=0;np1<nall1;++np1){
	for (np2=0;np2<nall2;++np2){
		pars[np1][np2]=pars[np1][np2]/gsum;
	}
}
}

void rdiric2(pvec pars,int ng)
{
/* Generates vector of Dirichlet random variables length ng (x_i) with parameters 
   in "pars", and stores in "pars" */

int     np1;
double  gsum=0.0;

for (np1=0;np1<ng;++np1){
	if (pars[np1]>1.0){
		pars[np1]=rstgam(pars[np1]);
	}
	else if (pars[np1]<1.0){    /* Ex. 3.22 in Ripley(1988) for alpha<1 */
		pars[np1]=rstgam(pars[np1]+1.0)*pow(WichHill(),1.0/pars[np1]);
	}
	else {    /* Exponential */
		pars[np1]= (-log(WichHill()));
	}
	gsum+= pars[np1];
}
for (np1=0;np1<ng;++np1){
	pars[np1]=pars[np1]/gsum;
}
}

double ldiricd(parvec ps,parvec pp)
{
/* evaluate log of Dirichlet prior density (parameters "pp") for the f_ij 
   ("ps") - includes constant */

double ldd=0.0, a0=0.0;

for (i=0;i<nall1;++i){
	for (j=0;j<nall2;++j){
		a0+= pp[i][j];
	}
}
ldd= loggf(a0);
for (i=0;i<nall1;++i){
	for (j=0;j<nall2;++j){
		ldd+= (pp[i][j]-1.0)*log(ps[i][j])-loggf(pp[i][j]);
	}
}
return ldd;
}

double ldiricd2(pvec ps,pvec pp,int ng)
{
/* evaluate log of Dirichlet prior density (parameters "pp") for the x_i (w_j) 
   ("ps", length "ng") - excludes constant */

double ldd=0.0;

for (i=0;i<ng;++i){
	ldd+= (pp[i]-1.0)*log(ps[i]);
}
return ldd;
}

double llkhd(parvec fs)
{
/* evaluates the log of the (multinomial) likelihood for genotypes,
   given random union of gametes model */

double  lld=0.0;

for (i=0;i<nall1;++i){
	for (j=0;j<nall2;++j){
		if (n[i][i][j][j]!=0){
			lld+= n[i][i][j][j]*2.0*log(fs[i][j]);
		}
		for (jdash=j+1;jdash<nall2;++jdash){
			if (n[i][i][j][jdash]!=0){
				lld+= n[i][i][j][jdash]*log(2.0*fs[i][j]*fs[i][jdash]);
			}
			for (idash=i+1;idash<nall1;++idash){
				if (n[i][idash][j][jdash]!=0){
					lld+= n[i][idash][j][jdash]*log(2.0*fs[i][j]*fs[idash][jdash]+2.0*fs[i][jdash]*fs[idash][j]);
				}
			}
		}
		for (idash=i+1;idash<nall1;++idash){
			if (n[i][idash][j][j]!=0){
				lld+= n[i][idash][j][j]*log(2.0*fs[i][j]*fs[idash][j]);
			}
		}
	}
}

return lld;
}

main()
{
/* set seeds for random number generator */
srand((unsigned) time(NULL));
ix=(long)(rand()%30000)+1;
iy=(long)(rand()%30000)+1;
iz=(long)(rand()%30000)+1;

/* open input/output files */
inp=fopen("ldmet.inp","r");
if (!inp){printf("Error - inp could not be opened\n");exit(1);}
out2=fopen("ldmetdp.out","w");
out3=fopen("ldmetf.out","w");
out4=fopen("ldmetd.out","w");
out5=fopen("ldmetxw.out","w");
out6=fopen("ldmetdpi.out","w");
if (!out2){printf("Error - out2 could not be opened\n");exit(1);}
if (!out3){printf("Error - out3 could not be opened\n");exit(1);}
if (!out4){printf("Error - out4 could not be opened\n");exit(1);}
if (!out5){printf("Error - out5 could not be opened\n");exit(1);}
if (!out6){printf("Error - out6 could not be opened\n");exit(1);}

/* read in data */

inpdat();

/* set prior parameters for hyperparameters X and W: equal 1.0 specifies
   multivariate uniform */

for (i=0;i<nall1;++i){
	xprior[i]=1.0;
}
for (j=0;j<nall2;++j){
	wprior[j]=1.0;
}

/* initialize hyperparameters by sampling from prior */

for (i=0;i<nall1;++i){
	x[i]=xprior[i];
}
for (j=0;j<nall2;++j){
	w[j]=wprior[j];
}
rdiric2(x,nall1);
rdiric2(w,nall2);

/* set prior parameters for Dirichlet prior of f_ij */

/* as in paper, set multiplication constant for prior parameters to
   be number of alleles (loc1) * number of alleles (loc2) */

pconst=nall1*nall2; 

for (i=0;i<nall1;++i){
	for (j=0;j<nall2;++j){
		pprior[i][j]= pconst*x[i]*w[j];
	}
}

/* initialize F from multivariate uniform (could use prior) */

for (i=0;i<nall1;++i){
	for (j=0;j<nall2;++j){
		f[i][j]= 1.0;
	}
}
rdiric(f);

/* initialize density value "z" for MCMC algorithm */

z=ldiricd(f,pprior)+ldiricd2(x,xprior,nall1)+ldiricd2(w,wprior,nall2)+llkhd(f);

/* begin Metropolis-Hastings algorithm */

for (l1=1;l1<=numit;++l1){

/* choose new candidate values */

      /* new f_ij */

	inc1=(int)(WichHill()*nall1);
	dec1=(int)(WichHill()*nall1);
	inc2=(int)(WichHill()*nall2);
	if (inc2==inc1){
		do {
			dec2=(int)(WichHill()*nall2);
		}
		while (dec2==inc2);
	}
	else {
		dec2=(int)(WichHill()*nall2);
	}

	cu1=(f[inc1][inc2]+epsln<f[inc1][inc2]+f[dec1][dec2])?f[inc1][inc2]+epsln:f[inc1][inc2]+f[dec1][dec2];
	if (1.0<cu1){ /* can only happen because of rounding error */
		cu1=1.0;
	}
	cl1=(f[inc1][inc2]-epsln>0)?f[inc1][inc2]-epsln:0;
	for (i=0;i<nall1;++i){
		for (j=0;j<nall2;++j){
			newf[i][j]=f[i][j];
		}

	}
	newf[inc1][inc2]= WichHill()*(cu1-cl1)+cl1;
	newf[dec1][dec2]-= newf[inc1][inc2]-f[inc1][inc2];

	cu2=(newf[inc1][inc2]+epsln<newf[inc1][inc2]+newf[dec1][dec2])?newf[inc1][inc2]+epsln:newf[inc1][inc2]+newf[dec1][dec2];
	if (1.0<cu2){ /* can only happen because of rounding error */
		cu2=1.0;
	}
	cl2=(newf[inc1][inc2]-epsln>0)?newf[inc1][inc2]-epsln:0;

      /* new x, w */

	inc1=(int)(WichHill()*nall1);
	do {
		dec1=(int)(WichHill()*nall1);
	}
	while (inc1==dec1);

	inc2=(int)(WichHill()*nall2);
	do {
		dec2=(int)(WichHill()*nall2);
	}
	while (inc2==dec2);

	b1=(x[inc1]+epsln2<x[inc1]+x[dec1])?x[inc1]+epsln2:x[inc1]+x[dec1];
	if (b1>1.0){ /* can only occur through rounding error */
		b1=1.0;
	}
	a1=(x[inc1]-epsln2>0)?x[inc1]-epsln2:0;
	for (i=0;i<nall1;++i){
		newx[i]=x[i];
	}
	newx[inc1]= (b1-a1)*WichHill()+a1;
	newx[dec1]-= newx[inc1]-x[inc1];

	b3=(w[inc2]+epsln2<w[inc2]+w[dec2])?w[inc2]+epsln2:w[inc2]+w[dec2];
	if (b3>1.0){ /* can only occur through rounding error */
		b3=1.0;
	}
	a3=(w[inc2]-epsln2>0)?w[inc2]-epsln2:0;
	for (j=0;j<nall2;++j){
		neww[j]=w[j];
	}
	neww[inc2]= (b3-a3)*WichHill()+a3;
	neww[dec2]-= neww[inc2]-w[inc2];

	b2=(newx[inc1]+epsln2<newx[inc1]+newx[dec1])?newx[inc1]+epsln2:newx[inc1]+newx[dec1];
	if (b2>1.0){ /* can only occur through rounding error */
		b2=1.0;
	}
	a2=(newx[inc1]-epsln2>0)?newx[inc1]-epsln2:0;

	b4=(neww[inc2]+epsln2<neww[inc2]+neww[dec2])?neww[inc2]+epsln2:neww[inc2]+neww[dec2];
	if (b4>1.0){ /* can only occur through rounding error */
		b4=1.0;
	}
	a4=(neww[inc2]-epsln2>0)?neww[inc2]-epsln2:0;

	/* evaluate density "newz" at candidate values */

	for (i=0;i<nall1;++i){
		for (j=0;j<nall2;++j){
			pprior2[i][j]= pconst*newx[i]*neww[j];
		}
	}

	newz=ldiricd(newf,pprior2)+ldiricd2(newx,xprior,nall1)+ldiricd2(neww,wprior,nall2)+llkhd(newf);

      /* decide whether or not to accept candidate values via Hastings ratio */

	r=newz-z+log(cu1-cl1)-log(cu2-cl2)+log(b3-a3)-log(b4-a4)+log(b1-a1)-log(b2-a2);

	if (log(WichHill())<r){ /* accept */
		z=newz;
		for (i=0;i<nall1;++i){
			x[i]=newx[i];
			for (j=0;j<nall2;++j){
				f[i][j]=newf[i][j];
				pprior[i][j]=pprior2[i][j];
			}
		}
		for (j=0;j<nall2;++j){
			w[j]=neww[j];
		}
		jmp++;
	}

      /* output */

	if (((l1/kp)*kp==l1) && (l1>discard)){ // after burn-in and thinned
            /* allele freqs */
		for (i=0;i<nall1;++i){
			p[i]=0;
		}
		for (j=0;j<nall2;++j){
			q[j]=0;
		}
		for (i=0;i<nall1;++i){
			for (j=0;j<nall2;++j){
				p[i]+= f[i][j];
				q[j]+= f[i][j];
			}
		}

            /* Calculate overall measure of disequilbrium, as given in Hedrick (1987) */

		ddash=0.0;
		for (i=0;i<nall1;++i){
			for (j=0;j<nall2;++j){
				d[i][j]= f[i][j]-p[i]*q[j];
				fprintf(out3,"%10.4f",f[i][j]);
				fprintf(out4,"%10.4f",d[i][j]);
				if (d[i][j]<0){
					dmin[i][j]=(-p[i]*q[j]>-(1-p[i])*(1-q[j]))?-p[i]*q[j]:-(1-p[i])*(1-q[j]);
					ddash2[i][j]=d[i][j]/-dmin[i][j];
				}
				else if (d[i][j]>0){
					dmax[i][j]=(p[i]*(1-q[j])<(1-p[i])*q[j])?p[i]*(1-q[j]):(1-p[i])*q[j];
					ddash2[i][j]=d[i][j]/dmax[i][j];
				}
				fprintf(out6,"%f ",ddash2[i][j]);
				fflush(out6);
				ddash+= p[i]*q[j]*fabs(ddash2[i][j]);
			}
		}
		fprintf(out3,"\n");
		fprintf(out4,"\n");
		fprintf(out6,"\n");
		fflush(out3);
		fflush(out4);
		fflush(out6);

		fprintf(out2,"%f\n",ddash);
		fflush(out2);
		for (i=0;i<nall1;++i){
			fprintf(out5,"%f ",x[i]);
		}
		for (j=0;j<nall2;++j){
			fprintf(out5,"%f ",w[j]);
		}
		fprintf(out5,"\n");
		fflush(out5);
	}

	if ((l1/acrt)*acrt==l1){     /* output acceptance rate */
		fprintf(out1,"%ld %12.6f\n",l1/acrt,((double)(jmp))/l1);
		fflush(out1);
	}
}

fclose(out1);
fclose(out2);
fclose(out3);
fclose(out4);
fclose(out5);
fclose(out6);

return 0;
}

