CORRAL: detecting microbial eukaryotes in metagenomic data

Before starting
Align raw reads to the EukDetect reference using Bowtie2
Identify eukaryotes based on Markov clustering

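Before starting

Make sure bowtie2 and marker_alignments are installed and on your PATH, and that the EukDetect reference database is available (the commands below expect it under /data/reference_db/eukdetect/).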

Align raw reads to the EukDetect reference using Bowtie2

# paired end data for multiple samples
# For every *R1_001.fastq.gz file in the current directory, align it together
# with its *R2_001.fastq.gz mate against the EukDetect marker reference and
# write the alignments to <sample>_.sam.
# Set BOWTIE2_THREADS to change the number of alignment threads (default: 104).
# NOTE(review): the bowtie2 manual documents -k as taking a single integer;
# "10,10" is kept exactly as it was -- confirm that it is intended.
for READ1 in *R1_001.fastq.gz
do
    # with no matching files the glob stays literal; never hand that to bowtie2
    [[ -e "$READ1" ]] || continue

    # swap/strip only the filename suffix, not every occurrence in the name
    READ2=${READ1%R1_001.fastq.gz}R2_001.fastq.gz
    SAMPLE=${READ1%_R1_001.fastq.gz}

    if [[ ! -f "$READ2" ]]; then
        echo "skipping $SAMPLE: mate file $READ2 not found" >&2
        continue
    fi

    if bowtie2 --omit-sec-seq --no-discordant --no-unal \
        -x /data/reference_db/eukdetect/ncbi_eukprot_met_arch_markers.fna \
        -k 10,10 -p "${BOWTIE2_THREADS:-104}" \
        -1 "$READ1" -2 "$READ2" -S "${SAMPLE}_.sam"
    then
        echo "done aligning reads from $SAMPLE to the eukdetect reference!"
    else
        # report the failure instead of claiming success, then try the next sample
        echo "bowtie2 failed for $SAMPLE" >&2
    fi
done

Identify eukaryotes based on Markov clustering

# set your filtering options
# for a more in-depth discussion of these options, see our CORRAL paper
# (kept as an array so every flag and value reaches marker_alignments as its
# own argument, without relying on word splitting of one long string)
FILTERING_OPTS=(
    --min-read-query-length 60
    --min-taxon-num-markers 2
    --min-taxon-num-reads 2
    --min-taxon-better-marker-cluster-averages-ratio 1.01
    --threshold-avg-match-identity-to-call-known-taxon 0.97
    --threshold-num-taxa-to-call-unknown-taxon 1
    --threshold-num-markers-to-call-unknown-taxon 4
    --threshold-num-reads-to-call-unknown-taxon 8
)

# Print the number of reads in a FASTQ file.
# Arguments: $1 - path to a FASTQ file, plain or gzip-compressed (*.gz)
# Outputs:   the read count on stdout
# Returns:   non-zero if the file cannot be read
# A FASTQ record is four lines, so reads = lines / 4. Counting lines that
# start with '@' overcounts, because quality strings may start with '@' too.
count_fastq_reads() {
    local fastq=$1
    local lines
    if [[ ! -r "$fastq" ]]; then
        echo "cannot read FASTQ file: $fastq" >&2
        return 1
    fi
    if [[ "$fastq" == *.gz ]]; then
        lines=$(gzip -dc -- "$fastq" | wc -l) || return
    else
        lines=$(wc -l < "$fastq") || return
    fi
    printf '%s\n' "$(( lines / 4 ))"
}

for SAM in *.sam
do
    # with no matching files the glob stays literal; skip it
    [[ -e "$SAM" ]] || continue
    SAMPLE=${SAM%.sam}

    # Locate the forward reads for this sample. The first candidate is the
    # path this snippet always used; the others match the <sample>_.sam /
    # <sample>_R1_001.fastq.gz naming of the alignment step above.
    READ1=
    for CANDIDATE in \
        "${SAMPLE}_R1.fastq" \
        "${SAMPLE}_R1.fastq.gz" \
        "${SAMPLE%_}_R1_001.fastq.gz" \
        "${SAMPLE%_}_R1_001.fastq"
    do
        if [[ -f "$CANDIDATE" ]]; then
            READ1=$CANDIDATE
            break
        fi
    done
    if [[ -z "$READ1" ]]; then
        echo "skipping $SAMPLE: no R1 FASTQ file found to count reads from" >&2
        continue
    fi

    if ! NUM_READS=$(count_fastq_reads "$READ1"); then
        echo "skipping $SAMPLE: could not count reads in $READ1" >&2
        continue
    fi

    if marker_alignments --input "$SAM" --output "${SAMPLE}.taxa.tsv" \
        --refdb-format eukprot \
        --refdb-marker-to-taxon-path /data/reference_db/eukdetect/busco_taxid_link.txt \
        --output-type taxon_all \
        --num-reads "$NUM_READS" \
        "${FILTERING_OPTS[@]}"
    then
        echo "done profiling euks in $SAMPLE!"
    else
        # report the failure instead of claiming success, then try the next sample
        echo "marker_alignments failed for $SAMPLE" >&2
    fi
done