11-24-2024
The goal here is to run the nf-core rnaseq pipeline on Seqera, replicating the process described by Emma Strand in https://resilience-biomarkers-for-aquaculture.github.io/ES-RNAseq_with_reference_dataset1/. I launched the rnaseq pipeline through the Seqera input form, using the parameters described in that post. For the input sample sheet, I converted the samplesheet produced by https://resilience-biomarkers-for-aquaculture.github.io/SY-Sequera_fetchngs_Cgigas/ to strip quotes and keep only the four columns the pipeline expects. To do that, I had ChatGPT write a Python script, included at the end of this post.
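For reference, the target layout is the standard four-column nf-core/rnaseq sample sheet. The row below is a made-up placeholder to show the shape, not an actual sample from this dataset:

sample,fastq_1,fastq_2,strandedness
SRRXXXXXXX,s3://my-bucket/fastq/SRRXXXXXXX_1.fastq.gz,s3://my-bucket/fastq/SRRXXXXXXX_2.fastq.gz,auto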
Upon launching the run, the resulting parameters were as follows:
{
  "help": false,
  "rseqc_modules": "bam_stat,inner_distance,infer_experiment,junction_annotation,junction_saturation,read_distribution,read_duplication",
  "gtf_extra_attributes": "gene_name",
  "umitools_grouping_method": "directional",
  "gtf_group_features": "gene_id",
  "validate_params": true,
  "bracken_precision": "S",
  "ribo_database_manifest": "${projectDir}/workflows/rnaseq/assets/rrna-db-defaults.txt",
  "help_full": false,
  "custom_config_version": "master",
  "aligner": "star_salmon",
  "featurecounts_group_type": "gene_biotype",
  "igenomes_base": "s3://ngi-igenomes/igenomes/",
  "publish_dir_mode": "copy",
  "umitools_extract_method": "string",
  "show_hidden": false,
  "featurecounts_feature_type": "exon",
  "min_trimmed_reads": 10000,
  "kallisto_quant_fraglen": 200,
  "kallisto_quant_fraglen_sd": 200,
  "pipelines_testdata_base_path": "https://raw.githubusercontent.com/nf-core/test-datasets/7f1614baeb0ddf66e60be78c3d9fa55440465ac8/",
  "skip_preseq": true,
  "stranded_threshold": 0.8,
  "hisat2_build_memory": "200.GB",
  "skip_bbsplit": true,
  "deseq2_vst": true,
  "trimmer": "fastp",
  "min_mapped_reads": 5,
  "pseudo_aligner_kmer_size": 31,
  "max_multiqc_email_size": "25.MB",
  "unstranded_threshold": 0.1,
  "custom_config_base": "https://raw.githubusercontent.com/nf-core/configs/master",
  "gencode": false,
  "igenomes_ignore": false,
  "remove_ribo_rna": false,
  "with_umi": false,
  "umitools_dedup_stats": false,
  "bam_csi_index": false,
  "star_ignore_sjdbgtf": false,
  "stringtie_ignore_gtf": false,
  "save_merged_fastq": false,
  "save_umi_intermeds": false,
  "save_non_ribo_reads": false,
  "save_bbsplit_reads": false,
  "save_reference": false,
  "save_trimmed": false,
  "save_align_intermeds": false,
  "save_unaligned": false,
  "save_kraken_assignments": false,
  "save_kraken_unassigned": false,
  "skip_gtf_filter": false,
  "skip_gtf_transcript_filter": false,
  "skip_umi_extract": false,
  "skip_trimming": false,
  "skip_alignment": false,
  "skip_pseudo_alignment": true,
  "skip_markduplicates": false,
  "skip_bigwig": false,
  "skip_stringtie": false,
  "skip_fastqc": false,
  "skip_dupradar": false,
  "skip_qualimap": false,
  "skip_rseqc": false,
  "skip_biotype_qc": false,
  "skip_deseq2_qc": false,
  "skip_multiqc": false,
  "skip_qc": false,
  "version": false,
  "plaintext_email": false,
  "monochrome_logs": false,
  "fasta": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/963/853/765/GCF_963853765.1_xbMagGiga1.1/GCF_963853765.1_xbMagGiga1.1_genomic.fna.gz",
  "gtf": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/963/853/765/GCF_963853765.1_xbMagGiga1.1/GCF_963853765.1_xbMagGiga1.1_genomic.gtf.gz",
  "gff": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/963/853/765/GCF_963853765.1_xbMagGiga1.1/GCF_963853765.1_xbMagGiga1.1_genomic.gff.gz",
  "extra_fastp_args": "--cut_mean_quality 30 --trim_front1 10 --trim_front2 10",
  "multiqc_title": "rnaseq_Cgigas_ArredondoEspinoza2023",
  "input": "s3://steveyost-seqera/Cgigas_ArredondoEspinoza2023/fetchngs/samplesheet/samplesheet_for_rnaseq.csv",
  "outdir": "s3://steveyost-seqera/Cgigas_ArredondoEspinoza2023/rnaseq/"
}
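As an aside, since this exact JSON gets reused for the second launch described below, a quick check like the following can confirm the run-specific overrides survive the copy/paste. This is only a sketch: the params.json file name and the list of keys are my own choices, not anything emitted by Seqera or the pipeline.

import json

# Keys filled in through the launch form; everything else above is a pipeline default.
RUN_SPECIFIC_KEYS = [
    "input", "outdir", "fasta", "gtf", "gff",
    "multiqc_title", "extra_fastp_args", "aligner",
]

def check_params(path="params.json"):
    """Report any run-specific parameter that is missing or empty in the saved JSON."""
    with open(path, "r", encoding="utf-8") as handle:
        params = json.load(handle)
    missing = [key for key in RUN_SPECIFIC_KEYS if not params.get(key)]
    if missing:
        print(f"Missing or empty run-specific parameters: {missing}")
    else:
        print("All run-specific parameters are set.")

check_params()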
It terminated with an error:
Error executing process > 'NFCORE_RNASEQ:RNASEQ:FASTQ_QC_TRIM_FILTER_SETSTRANDEDNESS:FASTQ_FASTQC_UMITOOLS_FASTP:FASTQC_RAW (2)'
Caused by:
Wave container request image cannot start with URL like prefix - offending value: https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0
The error appears to be a clash between Wave container provisioning and the singularity config, whose containers are specified as URLs to pre-built Singularity images that Wave refuses. I tried again, editing my pipeline to use the mamba config rather than the singularity config; for the run-specific parameters, I simply copy/pasted the above JSON.
The run succeeded. Wall time 5 h 13 m 9 s, 274.8 CPU hours, estimated cost $6.523.
In the AWS S3 bucket folder https://us-east-1.console.aws.amazon.com/s3/buckets/steveyost-seqera?prefix=Cgigas_ArredondoEspinoza2023/rnaseq/, the specified output folders were created: fastp, fastqc, multiqc, pipeline_info, and star_salmon.
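To spot-check those outputs without clicking through the console, the top-level prefixes can be listed with boto3. This is a sketch that assumes the same bucket and prefix as above and AWS credentials already configured locally:

import boto3

def list_output_folders(bucket="steveyost-seqera",
                        prefix="Cgigas_ArredondoEspinoza2023/rnaseq/"):
    """Print the top-level 'folders' (common prefixes) under the rnaseq output prefix."""
    s3 = boto3.client("s3")
    response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, Delimiter="/")
    for entry in response.get("CommonPrefixes", []):
        print(entry["Prefix"])

list_output_folders()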
Follow-up steps
To follow up, we will compare these results with those reported in the original study and with the results produced by Emma Strand in https://resilience-biomarkers-for-aquaculture.github.io/ES-RNAseq_with_reference_dataset1/; a sketch of that comparison is below.
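As a first pass, that comparison could correlate the merged gene-count matrices from the two runs. The file name salmon.merged.gene_counts.tsv and the local paths below are assumptions about how the star_salmon outputs will be downloaded and arranged, so adjust them to the actual files:

import pandas as pd

def compare_counts(path_a, path_b):
    """Correlate per-sample gene counts from two rnaseq runs of the same samples."""
    # Both files are assumed to be tab-separated with gene IDs in the first column.
    counts_a = pd.read_csv(path_a, sep="\t", index_col=0).select_dtypes("number")
    counts_b = pd.read_csv(path_b, sep="\t", index_col=0).select_dtypes("number")
    shared_genes = counts_a.index.intersection(counts_b.index)
    shared_samples = counts_a.columns.intersection(counts_b.columns)
    for sample in shared_samples:
        r = counts_a.loc[shared_genes, sample].corr(counts_b.loc[shared_genes, sample])
        print(f"{sample}: Pearson r = {r:.4f}")

# Hypothetical local copies of the count matrix from each run.
compare_counts("this_run/salmon.merged.gene_counts.tsv",
               "emma_strand_run/salmon.merged.gene_counts.tsv")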
Python script
Below is the Python script ChatGPT generated to process samplesheet.csv, extracting only the four desired columns and stripping quotes.
import csv

def extract_columns(input_csv, output_csv):
    # Define the desired columns
    desired_columns = ['sample', 'fastq_1', 'fastq_2', 'strandedness']
    try:
        with open(input_csv, 'r', newline='', encoding='utf-8') as infile:
            reader = csv.DictReader(infile)
            # Ensure all desired columns exist in the input
            if not all(column in reader.fieldnames for column in desired_columns):
                raise ValueError("Input CSV file does not contain all required columns.")
            with open(output_csv, 'w', newline='', encoding='utf-8') as outfile:
                writer = csv.DictWriter(outfile, fieldnames=desired_columns)
                writer.writeheader()
                for row in reader:
                    # Extract desired columns and remove quotes
                    cleaned_row = {key: row[key].strip('"').strip("'") for key in desired_columns}
                    writer.writerow(cleaned_row)
        print(f"Extraction complete. Output written to {output_csv}.")
    except FileNotFoundError:
        print(f"Error: The file {input_csv} does not exist.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
input_csv = 'samplesheet.csv'
output_csv = 'samplesheet_for_rnaseq.csv'
extract_columns(input_csv, output_csv)
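One note on the output: csv.DictWriter quotes fields only when necessary (Python's default QUOTE_MINIMAL), so samplesheet_for_rnaseq.csv comes out as plain, unquoted CSV containing just the four columns above.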