Marine Assembly: From Pacbio, Nanopore, 10x Genomics, to HiC Lastz workflow: axt to chain, to netted chains
sticklebackCipher/gatk/Dockerfile
Lines 1 to 18 in db6adda
| FROM ubuntu:20.04 | |
| ENV DEBIAN_FRONTEND=noninteractive | |
| # Install dependencies | |
| RUN apt-get update && apt-get install -y \ | |
| coreutils \ | |
| samtools \ | |
| bwa \ | |
| openjdk-17-jdk \ | |
| openjdk-17-jre \ | |
| fastqc \ | |
| python3 \ | |
| python3-pip \ | |
| && pip3 install --upgrade pip \ | |
| && pip3 install cutadapt \ | |
| && apt-get clean \ | |
| && rm -rf /var/lib/apt/lists/* |
sticklebackCipher/gatk/src/alignReads.bash
Lines 43 to 66 in 8906a84
| trim_galore --output_dir "$OUTPUT_DIR" --basename "$PREFIX" --trim-n --max_n 0 --cores $(( THREADS > 4 ? 4 : THREADS )) --paired "$READ_ONE" "$READ_TWO" | |
| # Define paths for Trim Galore output | |
| TRIMMED_FWD="${OUTPUT_DIR}/${PREFIX}_val_1.fq.gz" | |
| TRIMMED_REV="${OUTPUT_DIR}/${PREFIX}_val_2.fq.gz" | |
| # Adapter trimming and quality filtering with BBDuk | |
| echo "["$(date)"]\tRemoving adapters with BBDuk..." | |
| ./bbduk.sh \ | |
| in1="$TRIMMED_FWD" in2="$TRIMMED_REV" \ | |
| out1="${OUTPUT_DIR}/${PREFIX}_R1.fastq.gz" out2="${OUTPUT_DIR}/${PREFIX}_R2.fastq.gz" \ | |
| minlen=25 qtrim=rl trimq=10 ktrim=r k=25 mink=11 hdist=1 \ | |
| ref=$ADAPTERS | |
| # MAPPING with bwa and sorting with samtools | |
| echo -e "["$(date)"]\tAligning and sorting..." | |
| BAM="${OUTPUT_DIR}/${PREFIX}.bam" | |
| SORT_THREADS=$(( $THREADS / 2 )) # Adjust based on your system's capabilities | |
| bwa mem -t "$THREADS" "$REFERENCE" "${OUTPUT_DIR}/${PREFIX}_R1.fastq.gz" "${OUTPUT_DIR}/${PREFIX}_R2.fastq.gz" | \ | |
| samtools sort -@ "$SORT_THREADS" -T "${OUTPUT_DIR}/${PREFIX}.tmp" -o "$BAM" | |
| # Index the final BAM file | |
| samtools index "$BAM" |
sticklebackCipher/gatk/src/gatkfmtBam.bash
Lines 37 to 49 in 8906a84
| gatk --java-options "-Xmx$MEM" MarkDuplicates -I "$BAM" -O /dev/stdout -M /dev/null -VALIDATION_STRINGENCY SILENT -CREATE_INDEX true | \ | |
| samtools view -b -@ "$THREADS" -f 1 - | \ | |
| gatk --java-options "-Xmx$MEM" AddOrReplaceReadGroups -I /dev/stdin -O /dev/stdout -LB "WGS" -PL "$PL" -PU "ILUMINA" -SM "$PREFIX" | \ | |
| gatk --java-options "-Xmx$MEM" FixMateInformation -I /dev/stdin -O "$PROCESSED_BAM" -ADD_MATE_CIGAR true -IGNORE_MISSING_MATES true -TMP_DIR "${PREFIX}.fixmate.tmp" | |
| # Index the final output BAM | |
| samtools index "$PROCESSED_BAM" | |
| # Variant calling | |
| gatk --java-options "-Xmx$MEM" HaplotypeCaller -I "$PROCESSED_BAM" -O "${PREFIX}.g.vcf.gz" -R "$REFERENCE" -ERC GVCF --native-pair-hmm-threads "$THREADS" | |
| # Assuming $gVCF should be ${PREFIX}.g.vcf.gz based on previous command | |
| gatk --java-options "-Xmx$(echo $MEM | sed 's/G/g/')" GenotypeGVCFs -R "$REFERENCE" -V "${PREFIX}.g.vcf.gz" -O "${PREFIX}.vcf.gz" |
sticklebackCipher/gatk/src/mergeGvcf.bash
Lines 27 to 41 in 8906a84
| SAMPLES="${DIR}/${PREFIX}.list" | |
| : > "$SAMPLES" | |
| for i in "$DIR"/*.g.vcf.gz; do | |
| echo "$i" >> "$SAMPLES" | |
| done | |
| MERGED="${DIR}/${PREFIX}.merged.g.vcf.gz" | |
| VCF="${DIR}/${PREFIX}.cohort.genotyped.vcf.gz" | |
| # Combine GVCFs | |
| gatk --java-options "-Xmx32g" CombineGVCFs -R "$REFERENCE" --variant "$SAMPLES" -O "$MERGED" | |
| # Genotype GVCFs | |
| gatk --java-options "-Xmx32g" GenotypeGVCFs -R "$REFERENCE" -V "$MERGED" -O "$VCF" |
sticklebackCipher/gatk/src/snpIndelDiscovery.bash
Lines 18 to 27 in acc009d
| RAW_SNP="${PREFIX}.raw.SNPs.vcf.gz" | |
| FILTERED_SNP="${PREFIX}.filtered.SNPs.vcf.gz" | |
| # Using named pipe for intermediate steps | |
| mkfifo raw.snps.tmp default.snps.tmp frequency.snps.tmp | |
| gatk --java-options "-Xmx4g" SelectVariants -R "$REFERENCE" -V "$COHORT_VCF" -select-type SNP -O raw.snps.tmp & | |
| gatk --java-options "-Xmx4g" VariantFiltration -R "$REFERENCE" -V raw.snps.tmp --filter-expression "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" --filter-name "default_snp_filter" -O default.snps.tmp & | |
| gatk --java-options "-Xmx4g" VariantFiltration -R "$REFERENCE" -V default.snps.tmp --filter-expression "AC < 4" --filter-name "frequency_filter" -O frequency.snps.tmp & | |
| gatk --java-options "-Xmx4g" SelectVariants -R "$REFERENCE" -V frequency.snps.tmp --exclude-filtered -O "$FILTERED_SNP" |
sticklebackCipher/gatk/src/baseRecalibratorBQSR.bash
Lines 29 to 50 in acc009d
| # Step 1: Base quality score recalibration | |
| gatk BaseRecalibrator -R "$REFERENCE" \ | |
| --known-sites "$SNP_VCF" \ | |
| --known-sites "$INDEL_VCF" \ | |
| -I "$BAM" \ | |
| -O "$RECALIBRATED_TABLE" | |
| # Step 2: Apply BQSR | |
| gatk ApplyBQSR -R "$REFERENCE" \ | |
| -I "$BAM" \ | |
| --bqsr-recal-file "$RECALIBRATED_TABLE" \ | |
| -O "$BAM_RECALIBRATED" | |
| # Step 3: Index recalibrated BAM | |
| samtools index "$BAM_RECALIBRATED" | |
| # Step 4: Variant calling with HaplotypeCaller | |
| gatk HaplotypeCaller \ | |
| --input "$BAM_RECALIBRATED" \ | |
| --output "$GENOTYPE_VCF" \ | |
| --reference "$REFERENCE" \ | |
| -ERC GVCF |
sticklebackCipher/gatk/src/mergeGvcf.bash
Lines 27 to 41 in acc009d
| SAMPLES="${DIR}/${PREFIX}.list" | |
| : > "$SAMPLES" | |
| for i in "$DIR"/*.g.vcf.gz; do | |
| echo "$i" >> "$SAMPLES" | |
| done | |
| MERGED="${DIR}/${PREFIX}.merged.g.vcf.gz" | |
| VCF="${DIR}/${PREFIX}.cohort.genotyped.vcf.gz" | |
| # Combine GVCFs | |
| gatk --java-options "-Xmx32g" CombineGVCFs -R "$REFERENCE" --variant "$SAMPLES" -O "$MERGED" | |
| # Genotype GVCFs | |
| gatk --java-options "-Xmx32g" GenotypeGVCFs -R "$REFERENCE" -V "$MERGED" -O "$VCF" |
Step 7: The variant recalibration step fits a Gaussian mixture model to the contextual annotations given to each variant
sticklebackCipher/gatk/src/gatkGenotypeVariants.bash
Lines 35 to 62 in acc009d
| # Recalibrate variant quality scores for SNPs | |
| gatk --java-options "-Xmx30g" VariantRecalibrator \ | |
| -R "$REFERENCE" \ | |
| -V "$RAW_SNP" \ | |
| -resource:trainingSnpDb,known=false,training=true,truth=true,prior=8.0 "$SNPS_DB" \ | |
| -an DP -an QD -an FS -an SOR -an MQ -an MQRankSum -an ReadPosRankSum -an InbreedingCoeff \ | |
| -mode SNP \ | |
| -O "$BQSR_SNP" \ | |
| --tranches-file "$TRACE_FILE" \ | |
| --rscript-file "${BQSR_SNP}.output.plots.R" | |
| # Apply the recalibration to the SNP set | |
| gatk --java-options "-Xmx30g" ApplyVQSR \ | |
| -R "$REFERENCE" \ | |
| -V "$RAW_SNP" \ | |
| -ts-filter-level 99.9 \ | |
| -recal-file "$BQSR_SNP" \ | |
| --tranches-file "$TRACE_FILE" \ | |
| -O "${PREFIX}.vqsr.SNPs.vcf.gz" | |
| # Filter the final set of SNPs | |
| gatk --java-options "-Xmx8g" VariantFiltration \ | |
| -V "${PREFIX}.vqsr.SNPs.vcf.gz" \ | |
| -O "${PREFIX}.vqsr.ase.genotype.SNPs.vcf.gz" \ | |
| --cluster-window-size 35 \ | |
| --cluster-size 3 \ | |
| --filter-name "FS" --filter-expression "FS > 30.0" \ | |
| --filter-name "QD" --filter-expression "QD < 2.0" |