First we need to setup a few variables:
gct="%DIRECTORY WITH GCTA%" vcf="%DIRECTORY WITH KGP VCF FILES%"
And then we can download all necessary files:
# install plink and utilities to convert to plink format sudo apt-get install plink wget http://broadinstitute.org/~giulio/vcf2plink/vcf2tfam.sh wget http://broadinstitute.org/~giulio/vcf2plink/vcf2tped.sh chmod a+x vcf2tfam.sh vcf2tped.sh
# download the 1000 Genomes files pfx="http://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase1_vcf/chr" sfx=".1kg.ref.phase1_release_v3.20101123.vcf.gz" for chr in {1..22} X; do wget $pfx$chr$sfx $pfx$chr$sfx.tbi -P $vcf/; done
wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/20130606_g1k.ped # install GCTA wget http://www.complextraitgenomics.com/software/gcta/gcta_1.22.zip mkdir -p $gct unzip gcta_1.22.zip gcta64 -d $gct/
For each chromosome we now need to create a plink binary file:
# convert each VCF file to a plink binary file with sex information included pfx="$vcf/chr" for chr in {1..22} X; do tabix -H $pfx$chr$sfx | ./vcf2tfam.sh /dev/stdin > kgp.chr$chr.tfam tabix $pfx$chr$sfx $chr | ./vcf2tped.sh /dev/stdin > kgp.chr$chr.tped p-link --noweb --tfile kgp.chr$chr --make-bed --out kgp.chr$chr /bin/rm kgp.chr$chr.tfam kgp.chr$chr.tped kgp.chr$chr.nosex kgp.chr$chr.log awk 'NR==FNR {sex[$2]=$5} NR>FNR {$5=sex[$2]; print}' 20130606_g1k.ped kgp.chr$chr.fam > kgp.chr$chr.fam.tmp && /bin/mv kgp.chr$chr.fam.tmp kgp.chr$chr.fam done
At this point we can use plink to prune each chromosome and then join the individual datasets into a single large dataset as discussed in a previous post:
# prune each chromosome for chr in {1..22} X; do p-link --noweb --bfile kgp.chr$chr --maf .01 --indep 50 5 2 --out kgp.chr$chr p-link --noweb --bfile kgp.chr$chr --extract kgp.chr$chr.prune.in --make-bed --out kgp.chr$chr.prune done # join pruned datasets into single dataset cat kgp.prune.chr1.fam > kgp.prune.fam for chr in {1..22} X; do cat kgp.chr$chr.prune.bim; done > kgp.prune.bim (echo -en "\x6C\x1B\x01"; for chr in {1..22} X; do tail -c +4 kgp.chr$chr.prune.bed; done) > kgp.prune.bed
We can now run GCTA to compute the genetic relationship matrix and then the main principal components:
# perform PCA analysis of the whole dataset $gct/gcta64 --bfile kgp.prune --autosome --make-grm-bin --out kgp --thread-num 10 $gct/gcta64 --grm-bin kgp --pca 20 --out kgp --thread-num 10 # create PCA matrix to be loaded into R (echo "FID IID "`seq -f PCA%.0f -s" " 1 20`; (echo -n "#eigvals: "; head -n20 kgp.eigenval | tr '\n' ' '; echo; cat kgp.eigenvec) | awk 'NR==1 {n=NF-1; for (i=1; i<=n; i++) eval[i]=$(i+1)} NR>1 {NF=2+n; for (i=1; i<=n; i++) $(i+2)*=sqrt(eval[i]); print}') > kgp.pca
This will generate a table that can be easily loaded into R for further analyses.
No comments:
Post a Comment