CORRAL: detecting microbial eukaryotes in metagenomic data

Before starting

Align raw reads to the EukDetect reference using Bowtie2

# paired end data for multiple samples
# Runs bowtie2 once for every *R1_001.fastq.gz file in the current directory,
# pairing it with the R2 file of the same name and writing <sample>_.sam.
# Set BOWTIE2_THREADS to override the default of 104 alignment threads.
for READ1 in *R1_001.fastq.gz
do
    # Without nullglob an unmatched pattern is left as the literal string;
    # skip it instead of handing bowtie2 a file that does not exist.
    [ -e "$READ1" ] || continue

    # Rewrite only the trailing suffix, so a sample name that happens to
    # contain the same text elsewhere is left untouched.
    READ2="${READ1%1_001.fastq.gz}2_001.fastq.gz"
    SAMPLE="${READ1%_R1_001.fastq.gz}"

    if [ ! -e "$READ2" ]; then
        echo "skipping $SAMPLE: mate file $READ2 not found" >&2
        continue
    fi

    # The output keeps the original <sample>_.sam naming so files from
    # earlier runs and anything globbing *.sam afterwards are unaffected.
    # NOTE(review): "-k 10,10" is passed through exactly as before - confirm
    # against the bowtie2 manual that this is the intended value.
    if bowtie2 --omit-sec-seq --no-discordant --no-unal \
        -x /data/reference_db/eukdetect/ncbi_eukprot_met_arch_markers.fna \
        -k 10,10 -p "${BOWTIE2_THREADS:-104}" \
        -1 "$READ1" -2 "$READ2" -S "${SAMPLE}_.sam"
    then
        echo "done aligning reads from $SAMPLE to the eukdetect reference!"
    else
        # Report the failure instead of announcing success unconditionally.
        echo "bowtie2 failed for $SAMPLE" >&2
    fi
done

Identify eukaryotes based on Markov clustering

# set your filtering options
# for a more in-depth discussion of these options, see our CORRAL paper
FILTERING_OPTS="--min-read-query-length 60 --min-taxon-num-markers 2 --min-taxon-num-reads 2 --min-taxon-better-marker-cluster-averages-ratio 1.01 --threshold-avg-match-identity-to-call-known-taxon 0.97 --threshold-num-taxa-to-call-unknown-taxon 1 --threshold-num-markers-to-call-unknown-taxon 4 --threshold-num-reads-to-call-unknown-taxon 8"

# Print the number of reads in a FASTQ file (plain or gzip-compressed).
# Arguments: $1 - path to the FASTQ file (a name ending in .gz is decompressed)
# Outputs:   the read count on stdout
# Counts lines and divides by four rather than counting lines that start with
# '@', because '@' is also a valid first character of a quality line.
# Assumes the usual four-lines-per-record FASTQ layout.
count_fastq_reads() {
    case "$1" in
        *.gz) gzip -dc -- "$1" | awk 'END { printf "%d\n", NR / 4 }' ;;
        *)    awk 'END { printf "%d\n", NR / 4 }' < "$1" ;;
    esac
}

# Split the option string into separate words once, so every other expansion
# in the loop can be quoted safely.
read -r -a FILTERING_ARGS <<< "$FILTERING_OPTS"

# Profile every SAM file in the current directory into <sample>.taxa.tsv.
for SAM in *.sam
do
    # Without nullglob an unmatched pattern is left as the literal string.
    [ -e "$SAM" ] || continue

    # Strip only the trailing extension.
    SAMPLE="${SAM%.sam}"

    # Locate the forward-read file used for the read count. The original
    # <sample>_R1.fastq name is tried first; the remaining candidates cover a
    # gzip-compressed copy and the *_R1_001.fastq.gz naming used as input to
    # the alignment step (whose SAM names end in an extra underscore).
    READ1=""
    for CANDIDATE in \
        "${SAMPLE}_R1.fastq" \
        "${SAMPLE}_R1.fastq.gz" \
        "${SAMPLE%_}_R1_001.fastq.gz" \
        "${SAMPLE%_}_R1_001.fastq"
    do
        if [ -f "$CANDIDATE" ]; then
            READ1="$CANDIDATE"
            break
        fi
    done

    if [ -z "$READ1" ]; then
        echo "skipping $SAMPLE: no R1 FASTQ file found to count reads from" >&2
        continue
    fi

    if ! NUM_READS=$(count_fastq_reads "$READ1"); then
        echo "skipping $SAMPLE: could not count reads in $READ1" >&2
        continue
    fi

    if marker_alignments --input "$SAM" --output "${SAMPLE}.taxa.tsv" \
        --refdb-format eukprot \
        --refdb-marker-to-taxon-path /data/reference_db/eukdetect/busco_taxid_link.txt \
        --output-type taxon_all \
        --num-reads "$NUM_READS" "${FILTERING_ARGS[@]}"
    then
        echo "done profiling euks in $SAMPLE!"
    else
        # Report the failure instead of announcing success unconditionally.
        echo "marker_alignments failed for $SAMPLE" >&2
    fi
done