############################################################################
annotation pipeline will be started with the following parameters:
general parameters
  library file= /faststorage/project/PAN_illumina/people/peter/APinput_files/2023-02-17_tPAF_spermatocyte_RNAseq_pooledmRNArRNA.txt - all 9 libraries are available
  type of libraries= RNAseq 
  storage location= /faststorage/project/PAN_illumina/results//pra/dm6/RNAseq/2024-02-22-tPAF_spermatocyte_RNAseq_pooledmRNArRNA_DGE_preprint/
  tmp storage location= /faststorage/project/PAN_illumina/tmp//pra/dm6/RNAseq/2024-02-22-tPAF_spermatocyte_RNAseq_pooledmRNArRNA_DGE_preprint/

read preprocessing forced with flag -x
  adaptors for clipping= standard Illumina sequences will be used for adaptor-clipping fw=AGATCGGAAGAGCACACGTCT rv=AGATCGGAAGAGCGTCGTGTA
  trimming of random nucleotides= OFF
  generation of paired-fasta file= OFF
  
processing parameters
  genome assembly version= dm6
  genome annotation version= r6.40
  Y-chromosome= included in analysis
  length filtering= 18 - 1000
  trimming= Yes 1 - 1000
  MM allowed in mapping= 2
  track normalization= track normalization to 10M uniquely mapping reads
  read extension= 0
  filtered read classes= rRNA:tRNA:mito
  color for track= 0,128,0 

############################################################################

start command:
/faststorage/project/PAN_illumina/backup/scripts/AnnotationPipeline/annotate_reads.sh  -i /faststorage/project/PAN_illumina/people/peter/APinput_files/2023-02-17_tPAF_spermatocyte_RNAseq_pooledmRNArRNA.txt -Yx -F tPAF_spermatocyte_RNAseq_pooledmRNArRNA_DGE_preprint -t RNAseq -v dm6 -V r6.40 -N 0 -m 18 -M 1000 -f 1 -l 1000 -s 2 -e 0 -J 1000000 

Version= 4.0

Release Branch CommitID= 

############################################################################

libraries analyzed in this run:
/faststorage/project/PAN_illumina/tmp/HL5JGDRX2_0_R14703_20221220/B_216633_mRNA_216642_rRNA_spermatocyte_GFP_Cdc73L_CG10887_A12_3_rep2.fastq.gz B_216633_mRNA_216642_rRNA_spermatocyte_GFP_Cdc73L_CG10887_A12_3_rep2
/faststorage/project/PAN_illumina/tmp/HL5JGDRX2_0_R14703_20221220/C_216637_mRNA_216646_rRNA_spermatocyte_GFP_Cdc73L_CG12674_A2_7_rep3.fastq.gz C_216637_mRNA_216646_rRNA_spermatocyte_GFP_Cdc73L_CG12674_A2_7_rep3
/faststorage/project/PAN_illumina/tmp/HL5JGDRX2_0_R14703_20221220/A_216635_mRNA_216644_rRNA_spermatocyte_GFP_Cdc73L_rep3.fastq.gz A_216635_mRNA_216644_rRNA_spermatocyte_GFP_Cdc73L_rep3
/faststorage/project/PAN_illumina/tmp/HL5JGDRX2_0_R14703_20221220/B_216630_mRNA_216639_rRNA_spermatocyte_GFP_Cdc73L_CG10887_A12_3_rep1.fastq.gz B_216630_mRNA_216639_rRNA_spermatocyte_GFP_Cdc73L_CG10887_A12_3_rep1
/faststorage/project/PAN_illumina/tmp/HL5JGDRX2_0_R14703_20221220/A_216632_mRNA_216641_rRNA_spermatocyte_GFP_Cdc73L_rep2.fastq.gz A_216632_mRNA_216641_rRNA_spermatocyte_GFP_Cdc73L_rep2
/faststorage/project/PAN_illumina/tmp/HL5JGDRX2_0_R14703_20221220/C_216631_mRNA_216640_rRNA_spermatocyte_GFP_Cdc73L_CG12674_A2_7_rep1.fastq.gz C_216631_mRNA_216640_rRNA_spermatocyte_GFP_Cdc73L_CG12674_A2_7_rep1
/faststorage/project/PAN_illumina/tmp/HL5JGDRX2_0_R14703_20221220/C_216634_mRNA_216643_rRNA_spermatocyte_GFP_Cdc73L_CG12674_A2_7_rep2.fastq.gz C_216634_mRNA_216643_rRNA_spermatocyte_GFP_Cdc73L_CG12674_A2_7_rep2
/faststorage/project/PAN_illumina/tmp/HL5JGDRX2_0_R14703_20221220/B_216636_mRNA_216645_rRNA_spermatocyte_GFP_Cdc73L_CG10887_A12_3_rep3.fastq.gz B_216636_mRNA_216645_rRNA_spermatocyte_GFP_Cdc73L_CG10887_A12_3_rep3
/faststorage/project/PAN_illumina/tmp/HL5JGDRX2_0_R14703_20221220/A_216629_mRNA_216638_rRNA_spermatocyte_GFP_Cdc73L_rep1.fastq.gz A_216629_mRNA_216638_rRNA_spermatocyte_GFP_Cdc73L_rep1


all 9 libraries are available

############################################################################

libraries as supplied by the user:
/faststorage/project/PAN_illumina/tmp/HL5JGDRX2_0_R14703_20221220/C_216637_mRNA_216646_rRNA_spermatocyte_GFP_Cdc73L_CG12674_A2_7_rep3.fastq.gz	C_216637_mRNA_216646_rRNA_spermatocyte_GFP_Cdc73L_CG12674_A2_7_rep3
/faststorage/project/PAN_illumina/tmp/HL5JGDRX2_0_R14703_20221220/C_216634_mRNA_216643_rRNA_spermatocyte_GFP_Cdc73L_CG12674_A2_7_rep2.fastq.gz	C_216634_mRNA_216643_rRNA_spermatocyte_GFP_Cdc73L_CG12674_A2_7_rep2
/faststorage/project/PAN_illumina/tmp/HL5JGDRX2_0_R14703_20221220/C_216631_mRNA_216640_rRNA_spermatocyte_GFP_Cdc73L_CG12674_A2_7_rep1.fastq.gz	C_216631_mRNA_216640_rRNA_spermatocyte_GFP_Cdc73L_CG12674_A2_7_rep1
/faststorage/project/PAN_illumina/tmp/HL5JGDRX2_0_R14703_20221220/B_216636_mRNA_216645_rRNA_spermatocyte_GFP_Cdc73L_CG10887_A12_3_rep3.fastq.gz	B_216636_mRNA_216645_rRNA_spermatocyte_GFP_Cdc73L_CG10887_A12_3_rep3
/faststorage/project/PAN_illumina/tmp/HL5JGDRX2_0_R14703_20221220/B_216633_mRNA_216642_rRNA_spermatocyte_GFP_Cdc73L_CG10887_A12_3_rep2.fastq.gz	B_216633_mRNA_216642_rRNA_spermatocyte_GFP_Cdc73L_CG10887_A12_3_rep2
/faststorage/project/PAN_illumina/tmp/HL5JGDRX2_0_R14703_20221220/B_216630_mRNA_216639_rRNA_spermatocyte_GFP_Cdc73L_CG10887_A12_3_rep1.fastq.gz	B_216630_mRNA_216639_rRNA_spermatocyte_GFP_Cdc73L_CG10887_A12_3_rep1
/faststorage/project/PAN_illumina/tmp/HL5JGDRX2_0_R14703_20221220/A_216635_mRNA_216644_rRNA_spermatocyte_GFP_Cdc73L_rep3.fastq.gz	A_216635_mRNA_216644_rRNA_spermatocyte_GFP_Cdc73L_rep3
/faststorage/project/PAN_illumina/tmp/HL5JGDRX2_0_R14703_20221220/A_216632_mRNA_216641_rRNA_spermatocyte_GFP_Cdc73L_rep2.fastq.gz	A_216632_mRNA_216641_rRNA_spermatocyte_GFP_Cdc73L_rep2
/faststorage/project/PAN_illumina/tmp/HL5JGDRX2_0_R14703_20221220/A_216629_mRNA_216638_rRNA_spermatocyte_GFP_Cdc73L_rep1.fastq.gz	A_216629_mRNA_216638_rRNA_spermatocyte_GFP_Cdc73L_rep1


############################################################################


help file of used version:

  usage: /faststorage/project/PAN_illumina/backup/scripts/AnnotationPipeline/annotate_reads.sh options
  
  ###############################################################################
  This script analyses deep-sequencing libraries and generates
  several statistics and UCSC-compatible tracks.
  It can also convert raw-bam files located at NGS into adaptor clipped and N trimmed
  fasta files. 
  
  Version= 4.0
  
  Do not start the script in the background by adding "&" to the start command.

  
  to attach the screen again use the command screen -r
  
  A detailed description of the functionality and usage can be found in:
  "XX-SCRIPT_DIR-XX"README.md
      
  OPTIONS:
      -h  Show this message
    
  OBLIGATORY [but only one of the two options at any time]
      -i  File containing the information for all the libraries
            Format:  Path-to-LIBRARY tab Name-for-files (optional: tab IP) newline
              following formats of Path-to-LIBRARY are possible:
                - specify full path - use this for libraries from our SeqArchive or for 
                    a random library not available from SRA,NGS
                - SRA library (specify as SRR###)
                - NGS library as URL (specify URL you get from the NGS portal)
                    you have to supply the barcode in the format BC=###### in column >=3
                - NGS library via NGS ID (specify NGS ID in the format NGS######) 
                    in case multiple runs are available for a sample you will be asked 
                    to select the ones you like to analyze/merge. 
                    the barcode will be extracted automatically                    

            allowed input-file formats: bam; fa; fq; fa.gz; fq.gz
            merging of files is possible for local-bam, NGS-URL as semicolon [;] separated list
            NGS-ID selection during startup of script (only supply ID)
            
            The color for the UCSC track can be defined for each library individually by adding 
            COLOR=255,255,255 to the line. Please only use valid RGB codes as there is no check.
            For libraries without COLOR tag the default color will be used
            
      -u  update old AnnotationPipeline run to newest version [give path to folder]
            this function is mainly present to keep old UCSC sessions alive and up to date

  OPTIONAL - available in interactive mode [if not set, the script will ask the user to set these fields]
      -F  Name tag appended to the output FOLDER in which the results get deposited
            Suggestion is a tag to easily identify the experiment/LIBRARY contained in the run
            Data can be accessed at: [will be visible in the helpfile of the installation]

      -t  TYPE of LIBRARY [RNAseq, sRNAseq, sRNAseqIP, GROseq, CLIPseq, CHIPseq, DNAseq, RNAseq_QuantSeq, CapSeq, RIPseq]
            if both sRNAseq and sRNAseqIP libraries are present in -i
            then use sRNAseq and add a third column containing the tag "IP" 
            to the file specified in -i, but only for the IP libraries
            format:  full_path_of_LIBRARY "tab" name_you_would_like_to_have_for_LIBRARY "tab" IP
            if TYPE=sRNAseq tracks get normalized to 1M miRNAs others see default values in README.md
      -j  set flag if libraries are containing 4SU SLAMseq like T-->C conversions

      -V  VERSION of Genome annotation 
            [format for reference genomes: "r#.##" 
             format for assemblies: "LINE.VERSION" - e.g. OSC.v19]
            if option is left unset, then you will be prompted which annotation to use
            and if it does not exists, the generation of a new one will be started 
      -D  for TYPE=RNAseq perform differential gene expression (DGE) analysis
            if set, you will be asked to pair libraries based on conditions
            requires to have at least 2 replicates for the genotypes selected for analysis
            not all libraries in the run have to be added for DGE analysis
            
  OPTIONAL [these settings have to be set as options in the start-command, none of these is required if default
        settings are used]

    bam -> fasta 
      -x  set flag to force pre-processing even if input format is not bam
      -N  trimming of N random nucleotides from each end [0-8]
            specify how many random nucleotides are present on the read after adaptor-clipping
      -P  report paired-file [only applicable if starting from bam files] [no input required]
            in addition to the annotated fasta file a 2nd fasta file containing the raw and paired reads is
            generated - the sequences in this fasta-file got adaptor clipped and N-trimmed if applicable
      -p  report only paired end file
            automatically activates -P and -Q
      -Q  report fastq-file after adaptor trimming and N clipping[only applicable if starting from NGS or fastq files] [no input required]
            in addition to the standard output an adaptor-clipped and random-nucleotide trimmed
            fastq file is copied to the output directory
      -q  report raw fastq-file that only got demultiplexed [only applicable if starting from NGS or fastq files] [no input required]
            in addition to the standard output a raw fastq file is copied to the output directory
      -A  use non standard adaptor sequences for adaptor clipping [no input required]
            if flag is set, the script will ask you to supply custom adaptor sequences
      -r  set flag to only generate demultiplexed pre-processed files
          per default this option only produces fasta - for fastq add Q and/or q
            the following steps are executed
              -demultiplexing
              -adaptor clipping
              -N trimming
              -fasta-file transferred to open directory
      -R  subsample reads to analyze a fraction only
            supply numeric value
            can be used per sample by the addition of SUB=#### into the sample file

    read-preprocessing        
      -m  minimal length of small RNA after TRIMMing [defaults: sRNAseq="18" | RNAseq/GROseq/CHIPseq-default="18"]
      -M  maximal length of small RNA [defaults: sRNAseq="35" | RNAseq/GROseq/CHIPseq-default="200"]
      -T  TRIMM reads [no input required]
           set flag to TRIMM reads
            [RNAseq & GROseq trimmed to bases 5-50 by default; 
             can be overwritten by supplying your own trimm settings]
      -f  FIRST base for TRIMMing - give the FIRST base to keep
      -l  LAST base for TRIMMing - give the LAST base to keep 
      -a  set flag to remove polyA stretches from the end of the reads 
            e.g. to remove PolyA containing reads originating from the end of the transcripts
      -I  set flag to invert reads (can be useful for GROseq (old libraries cloned by ligation))
      -S  set flag to force single-end processing and to delete a potential 2nd mate-read from 
            paired end sequencing
      -2  set flag to force processing of only the 2nd mate of paired end data

      -J  max sequence count - ignore counts beyond this number for each sequenced sequence  
            e.g. -J 1 analyzes each sequence only once and ignores the counts completely
    
    mapping & analysis 
      -v  Genome Version ["dm3" or "dm6" for reference genome; ASM for assemblies [default:"dm6"]]
      -Y  set flag to include the Y-chromosome in the analysis 
            [default: Y-chromosome completely excluded from analysis]
      -s  mismatches allowed for mapping against the genome [0-3; default: "0" except for CLIPseq=3]
            mapping against the rRNA precursor and the mitochondrial genome is always
            performed with 3 MM allowed.
      -E  distribute multimappers evenly instead of reporting all positions
            not well tested - probably only makes sense for tracks 
      -W  wig tracks normalization to 10M uniquely mapping reads [no input required]
            [default for RNAseq, CLIPseq, GROseq]
            not compatible with option -w     
      -w   wig tracks normalization to 10M reads in the input fasta-file [no input required]
           [default for sRNAseqIP, CHIPseq, DNAseq]
            not compatible with option -W     
      -L  wig track normalization based on synthetic spike-ins
            can be activated for individual libraries by adding "spikeINnorm" to the input file
      -z   set flag to turn off normalization completely. 
            raw-read-counts will be reported in the graphs
      -a   set flag to set wig tracks to auto scale in UCSC
      -5   set flag to create wig tracks using the 5' end only 
            no full read-length coverage will be computed
      -n  run analysis in unstranded mode
            does change output for all stranded analysis modes to report unstranded outputs
            affects tracks, annotation-counts
      -e  extend read to fragment length (only for CHIPseq) [#]
      -b  export collapsed bam for mapped reads (uniquely and all mapper files)
      -B  export uncollapsed bam for uniquely mapped reads
      -c   overwrite the default track coloring [format: R,G,B] 
            if not set, tracks will get colored depending on the TYPE of the library
      -X  set option to re-import libraries from NGS 
            only use if you really want to regenerate the de-multiplexed bam file
            in the storage 
      -G  create export for GEO submission 
            !!!! only possible for samples submitted in the NGS##### format !!!!
            output-files:
              raw-bam files 
              h5 values for the raw-files
              processed-files
                sRNAseq = read-count table
                RNAseq = wig-tracks + sleuth-quantification (if DGE was performed)
                CHIPseq = wig-tracks
      -U  run script under a different username
            use if you are running the script for someone else
            structure [format: "handler"] 
      -Z  import data from official NGS server
      -g  also download listing from the Ameres group
            will only work if your NGS account is linked to the Ameres group
      
            
############################################################################

total time =  68.8333 min