To begin with, you need to setup a few variables:
hpr="%DIRECTORY WITH HAPI-UR BINARY FILES%" gmp="%DIRECTORY WITH GENETIC MAP FILES%" bgl="%DIRECTORY WITH BEAGLE BINARY FILE%" dsh="%DIRECTORY WITH DASH BINARY FILE%" set="%PLINK DATASET PREFIX%"
And have a couple of programs pre-installed on your Linux machine:
# install plink sudo apt-get install plink # install hapi-ur wget https://hapi-ur.googlecode.com/files/hapi-ur-1.01.tgz tar xzvf hapi-ur-1.01.tgz mkdir -p $hpr /bin/cp hapi-ur-1.01/hapi-ur hapi-ur-1.01/insert-map.pl $hpr/ # install genetic maps wget http://mathgen.stats.ox.ac.uk/impute/genetic_maps_b37.tgz tar xzvf genetic_maps_b37.tgz mkdir -p $gmp for i in {1..22} X_PAR1 X_nonPAR X_PAR2; do gzip -c genetic_maps_b37/genetic_map_chr${i}_combined_b37.txt > $gmp/chr$i.gmap.gz; done # install beagle mkdir -p $bgl wget http://faculty.washington.edu/browning/beagle/b4.r1196.jar -P $bgl/ # install dash wget http://www1.cs.columbia.edu/~gusev/dash/dash-1-1.tar.gz tar xzvf dash-1-1.tar.gz mkdir -p $dsh /bin/cp dash-1-1/bin/dash_cc $dsh/
The following pipeline will run the analysis independently on each autosome:
# perform IBD and DASH analysis separately for each autosome for i in {1..22}; do # create a directory mkdir -p chr$i # extract autosome of interest p-link --noweb --bfile $set --chr $i --make-bed --out chr$i/$set # generate marker file with genetic distance zcat $gmp/chr$i.gmap.gz | $hpr/insert-map.pl chr$i/$set.bim - | awk -v OFS="\t" \ 'length($5)>1 || $6=="-" {$5="I"; $6="D"} $5=="-" || length($6)>1 {$5="D"; $6="I"} {print $1,$2,$3,$4,$5,$6}' > chr$i/$set.gmap.bim # clean up just in case /bin/rm chr$i/$set.sample chr$i/$set.haps # phase haplotypes using hapi-ur $hpr/hapi-ur -g chr$i/$set.bed -s chr$i/$set.gmap.bim -i chr$i/$set.fam -w 73 --impute -o chr$i/$set # convert hapi-ur output for use with beagle (echo "##fileformat=VCFv4.1"; echo "##FORMAT="; echo -en "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t"; awk 'NR>2 {printf $1"."$2"\t"}' chr$i/$set.sample | sed 's/\t$/\n/g'; awk -v c=$i '{printf c"\t"$3"\t"$2"\t"$4"\t"$5"\t.\tPASS\t.\tGT"; for (i=6; i<NF; i+=2) printf "\t"$i"|"$(i+1); printf "\n"}' chr$i/$set.haps) | java -Xmx3g -jar $bgl/b4.r1185.jar ibd=true gt=/dev/stdin usephase=true burnin-its=0 phase-its=0 out=chr$i/$set # convert beagle output for use with DASH awk '{sub("\\\."," ",$1); sub("\\\."," ",$3); print $1"."$2-1,$3"."$4-1,$6,$7}' chr$i/$set.hbd chr$i/$set.ibd | $dsh/dash_cc chr$i/$set.fam chr$i/$set # create map file with IBD clusters cut -f1-5 chr$i/$set.clst | awk -v c=$i '{ print c,c"_"$1,$2,$3,$4,$5 }' > chr$i/$set.cmap # create map file with IBD clusters for use with plink awk -v c=$i '{ print c,$2,0,int(($3+$4)/2) }' chr$i/$set.cmap > chr$i/$set.map # convert output to plink binary format p-link --maf 0.001 --file chr$i/$set --make-bed --recode --out chr$i/$set done
You can then join the results in a single plink binary file using the code discussed in a previous post:
# join separate autosome outputs into a single plink binary file for i in {1..22}; do cat chr$i/$set.cmap; done > $set.dash.cmap cp $set.fam $set.dash.fam for i in {1..22}; do cat chr$i/$set.bim; done > $set.dash.bim (echo -en "\x6C\x1B\x01"; for i in {1..22}; do tail -c +4 chr$i/$set.bed; done) > $set.dash.bed