#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <time.h>

/*
     	Program "ldhap.c"         (09/08/00)
	
	Program for sampling from the posterior probability distribution
	for two-locus haplotype frequencies for each pair of loci in a
	multi-locus dataset (data consists of haplotype counts). This
	program is ONLY for biallelic loci, and outputs posterior
	summary statistics (median, percentiles) for two disequilibrium
	measures (D', R^2) by sampling INDEPENDENTLY for each pair of
	loci - a sample is first drawn from the posterior for haplotype
	frequencies f_ij: from these, a sample from the posterior for
	the disequilibrium coefficient can be obtained.

	The method is outlined in Ayres & Balding (2000), Genetics 157.

	The Bayesian approach requires a prior for f_ij, and likelihood.
	We implement a Dirichlet prior for f_ij, and a multinomial likelihood
   	for haplotype frequencies f_ij. The posterior for f_ij is therefore
	Dirichlet with parameters = (prior parameters+observed haplotype
	counts), and a sample can be generated directly from a standard
	method for sampling from this distribution.

	The format of the input file should be locus label (real number)
	followed by series of zeros and ones representing the alleles
	for each haplotype in the sample (i.e. data file should be a
	matrix L rows by N+1 columns where L=number of loci (or segregating
	sites for sequence data), N=number of haplotypes in the sample,
	e.g.

	0.345 0 0 1 0 0 1 1 1 0 0
      0.463 0 1 1 1 0 0 0 1 1 0
      0.564 1 1 1 1 0 0 0 0 0 1

	The format of the output file is: locus1 label, locus2 label,
	followed by median(D'), 5th percentile(D'), 95th percentile(D'),
	median(R^2), 5th percentile(R^2), 95th percentile(R^2).

	Values that need to specified by the user (listed below as
	globals) are:

	maxloc  - maximum number of loci (or segregating sites) allowed 
	num     - sample size (number of haplotypes in the sample)
		    NOTE THAT THE SAMPLE SIZE MUST BE SPECIFIED EXACTLY HERE
 	npost   - number of values to generate from the posterior 
*/

/* GLOBALS */

#define maxloc 20   /* maximum number of loci allowed */
#define num 92       /* sample size */
#define npost 1000 /* number of values to generate from posterior */

int i, j, nmuts, nn, k, datmat[maxloc][num], cnt[2][2];
double seglb[maxloc], pmed[2], p5[2], p95[2], prior[2][2];
double  gsum, fpost[2][2], dpvec[npost], rsvec[npost];
double  par[2][2], di, tmp, p, q,  ff;
long ix, iy, iz;
FILE *inp, *out1;

double WichHill()
{
/* Wichmann & Hill random number generator (Algorithm AS183 in Applied
   Statistics 31 (1982))
*/

ix=(171*ix)%30269;
iy=(172*iy)%30307;
iz=(170*iz)%30323;
return fmod((ix/30269.0)+(iy/30307.0)+(iz/30323.0),1.0);
}

void inpdat()
{
/* read in data */

nmuts=0;
while ((nn=fscanf(inp,"%lf",&ff))!=EOF){
seglb[nmuts]=ff; /* site (locus) label */
	for (i=0;i<num;++i){
		fscanf(inp,"%d",&datmat[nmuts][i]);
	}
	nmuts++;
	if (nmuts>maxloc){printf("ERROR - increase maxloc\n");exit(1);}
}
fclose(inp);
}

double rstgam(double alpha1)
{
/* Cheng & Feast algorithm (algorithm 3.20 in Ripley 1988) for
   generating a standard Gamma random variable with shape
   parameter alpha>1 */

int     acpt;
double  c1, c2, c3, c4, c5, u1, u2, w;

c1=alpha1-1.0;
c2=(alpha1-(1.0/(6.0*alpha1)))/c1;
c3=2.0/c1;
c4=c3+2.0;
c5=1.0/sqrt(alpha1);
do{
	do{
		u1=WichHill();
		u2=WichHill();
		if (alpha1>2.5){
			u1=u2+c5*(1.0-(1.86*u1));
		}
	}
	while (u1>=1.0 || u1<=0);

	w=(c2*u2)/u1;
	if ((c3*u1+w+(1.0/w))<=c4){
		acpt=1;
	}
	else if ((c3*log(u1)-log(w)+w)>=1.0){ /* repeat outer "do" loop */
		acpt=0;
	}
	else{
		acpt=1;
	}
	}
while (acpt!=1);

return c1*w;
}

void postsim()
{
/* posterior simulation assuming Dirichlet prior and multinomial likelihood
   for haplotype frequencies f_ij and calculates posterior summary stats for 
   output (median, percentiles) of two disequilibrium measures    */

int     i, j, np;

for (np=0;np<npost;++np){
	gsum=0.0;
	for (i=0;i<2;++i){
		for (j=0;j<2;++j){ 
			par[i][j]=cnt[i][j]+prior[i][j];
			if (par[i][j]>1.0){
				fpost[i][j]=rstgam(par[i][j]);
			}
			else if (par[i][j]<1.0){   /* Ex. 3.22 in Ripley(1988) for alpha<1 */
				fpost[i][j]=rstgam(par[i][j]+1.0)*pow(WichHill(),1.0/par[i][j]);
			}
			else {    /* Exponential */
				fpost[i][j]= (-log(WichHill()));
			}
			gsum+= fpost[i][j];
		}
	}
	for (i=0;i<2;++i){
		for (j=0;j<2;++j){
			fpost[i][j]=fpost[i][j]/gsum;
		}
	}
	p=fpost[1][0]+fpost[1][1];
	q=fpost[0][1]+fpost[1][1];

	/* D */

	di=fpost[1][1] - p*q;

	/* R^2 */

	rsvec[np]=(di*di)/(p*(1.0-p)*q*(1.0-q));

	/* D' */

	if (di>0){dpvec[np]=di/((p*(1-q)<(1-p)*q)?p*(1-q):(1-p)*q);}
	else if (di<0){dpvec[np]=di/((p*q<(1-p)*(1-q))?p*q:(1-p)*(1-q));dpvec[np]=fabs(dpvec[np]);}
	else {dpvec[np]=0.0;}
}

/* order values */

for (np=0;np<npost;++np){
	for (i=np+1;i<npost;++i){
		if (dpvec[np]>dpvec[i]){
			tmp=dpvec[np];
			dpvec[np]=dpvec[i];
			dpvec[i]=tmp;
		}
		if (rsvec[np]>rsvec[i]){
			tmp=rsvec[np];
			rsvec[np]=rsvec[i];
			rsvec[i]=tmp;
		}
	}
}

pmed[0]=dpvec[npost/2];
pmed[1]=rsvec[npost/2];

p5[0]=dpvec[(int)(npost*0.05)];
p95[0]=dpvec[(int)(npost*0.95)];
p5[1]=rsvec[(int)(npost*0.05)];
p95[1]=rsvec[(int)(npost*0.95)];

fprintf(out1,"%f    %f  %f  ",pmed[0],p5[0],p95[0]);
fprintf(out1,"%f    %f  %f\n",pmed[1],p5[1],p95[1]);
}

main()
{
/* set seeds for random number generator */
srand((unsigned) time(NULL));
ix=(long)(rand()%30000)+1;
iy=(long)(rand()%30000)+1;
iz=(long)(rand()%30000)+1;

/* open input/output files */

inp=fopen("ldhap.inp","r");
if (!inp){printf("inp NOT OPENED\n");exit(1);}

out1=fopen("ldhap.out","w");
if (!out1){printf("out1 NOT OPENED\n");exit(1);}

/* Read in data */

inpdat();

/* Set prior probabilities for haplotype freqs (=1.0 implements multivariate uniform) */

for (i=0;i<2;++i){
	for (j=0;j<2;++j){
		prior[i][j]=1.0;
	}
}

/* Do posterior simulation independently for all pairs of loci */
for (i=0;i<nmuts;++i){
	for (j=i+1;j<nmuts;++j){
		/* calculate haplotype counts (global array) for locus i and j */
		cnt[0][0]=0;
		cnt[0][1]=0;
		cnt[1][0]=0;
		cnt[1][1]=0;

		for (nn=0;nn<num;++nn){
			 if (datmat[i][nn]>=0 && datmat[j][nn]>=0){cnt[datmat[i][nn]][datmat[j][nn]]++;}
		}
	 	fprintf(out1,"%f %f   ",seglb[i],seglb[j]); /* output locus labels */
		postsim(); /* do posterior sampling */
	}
}

fclose(out1);
return 0;
}
