**** TODO Reproduire les performances precisionchallenge
***** TODO 0GOOR
****** vcfeval : echec
#+begin_src sh
cd /Work/Groups/bisonex/data/NA12878/precisionChallenge
curl -O https://data.nist.gov/od/ds/ark:/88434/mds2-2336/submission_vcfs/0GOOR/0GOOR_HG002.vcf.gz
bcftools annotate --rename-chrs /Work/Groups/bisonex/data/genome/GRCh38.p13/chromosome_mapping.txt 0GOOR_HG002.vcf.gz -o 0GOOR_HG002_renamed.vcf.gz
tabix 0GOOR_HG002_renamed.vcf.gz
#+end_src
Soumission
#+begin_src slurm
#!/bin/bash
#SBATCH -c 4
#SBATCH -p smp
#SBATCH --time=01:00:00
#SBATCH --mem=32G
module load nix/2.11.0
export HGREF=/Work/Groups/bisonex/data-alexis-reference/genome/GRCh38_latest_genomic.fna
dir=/Work/Groups/bisonex/data/NA12878/
rtg vcfeval -b $dir/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \
--bed-regions ${dir}/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.bed \
-c ${dir}/precisionChallenge/0GOOR_HG002_renamed.vcf.gz \
-o test-0GOOR -t /Work/Groups/bisonex/data/genome/GRCh38.p13/genomeRef.sdf
#rtg vcfeval -b $dir/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz --bed-regions ${dir}/benchmark-exons.bed -c files/vcf/NA12878_NIST7035_vep_annot.vcf.gz -o test-rtg -t /Work/Groups/bisonex/data/genome/GRCh38.p13/genomeRef.sdf
#+end_src
Résultat :
VCF header does not contain a FORMAT field named GQ
There were 112 problematic called variants skipped during loading (see vcfeval.log for details).
There were 2303523 variants not thresholded in ROC data files due to missing or invalid GQ (FORMAT) values.
Could not select maximized F-measure threshold from ROC data, only un-thresholded statistics will be shown. Consider selecting a different scoring attribute with --vcf-score-field
Threshold True-pos-baseline True-pos-call False-pos False-neg Precision Sensitivity F-measure
----------------------------------------------------------------------------------------------------
None 938101 934445 1369078 1276425 0.4057 0.4236 0.4144
****** hap.py sans vcfeval (utilisé pour le challenge) : echec
#+begin_src slurm
#!/bin/bash
#SBATCH -c 4
#SBATCH -p smp
#SBATCH --time=01:00:00
#SBATCH --mem=32G
module load nix/2.11.0
export HGREF=/Work/Groups/bisonex/data-alexis-reference/genome/GRCh38_latest_genomic.fna
dir=/Work/Groups/bisonex/data/NA12878/
hap.py $dir/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \
${dir}/precisionChallenge/0GOOR_HG002_renamed.vcf.gz \
-f ${dir}/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.bed \
-o test
#+end_src
Type Filter TRUTH.TOTAL TRUTH.TP TRUTH.FN QUERY.TOTAL QUERY.FP QUERY.UNK FP.gt FP.al METRIC.Recall METRIC.Precision METRIC.Frac_NA METRIC.F1_Score
INDEL ALL 276768 102437 174331 1156702 211537 842303 53768 58006 0.370119 0.327170 0.728194 0.347322
INDEL PASS 276768 102437 174331 1156702 211537 842303 53768 58006 0.370119 0.327170 0.728194 0.347322
SNP ALL 1937706 835470 1102236 5666020 1160590 3669981 437793 21058 0.431164 0.418553 0.647718 0.424765
SNP PASS 1937706 835470 1102236 5666020 1160590 3669981 437793 21058 0.431164 0.418553 0.647718 0.424765
On est censé avoir : https://data.nist.gov/od/ds/ark:/88434/mds2-2336/benchmarking_results/0GOOR/0GOOR_HG002.extended.csv
Type Filter METRIC.Recall METRIC.Precision METRIC.Frac_NA METRIC.F1_Score
INDEL ALL 0.935776 0.903997 0.523369 0.919612
INDEL PASS 0.935776 0.903997 0.523369 0.919612
SNP ALL 0.998554 0.99395 0.403126 0.996247
SNP PASS 0.998554 0.99395 0.403126 0.996247
****** Avec chr originels
#+begin_src
awk '{
if($0 !~ /^#/)
print "chr"$0;
else if(match($0,/(##contig=<ID=)(.*)/,m))
print m[1]"chr"m[2];
else print $0
}' 0GOOR_HG002.vcf | gzip -c > 0GOOR_HG002_chr.vcf.gz
#+end_src
****** Avec même génome de référence
***** TODO réinstaller hap.py
****** python 2
****** vceval