add marker database script

200f523d · Dr. Mohamed Mohamed Hefny Salim · 43add9e9 · 200f523d
Commit 200f523d authored 4 years ago by Dr. Mohamed Mohamed Hefny Salim
--- a/STOMACH_ANALYSIS/marker_database.sh
+++ b/STOMACH_ANALYSIS/marker_database.sh
+#!/bin/bash
+################################################################################
+#                         CustomMarkerDatabase                                 #
+#                                                                              #
+# Script for building BLAST database input file of all COI and rbcl sequences  #
+# in GenBank                                                                   #
+#                                                                              #
+# description: xxxxxxxxxxxxxxxxxxxxxxxx                                        #
+#                                                                              #
+#  Copyright (C) 2020 Rahma Amen (Potsdam University)                          #
+#  xxxxxxxxxxx@xxxxxxxx                                                        #
+#                                                                              #
+# Change History                                                               #
+#   16/07/2020  Rahma Amen  created the script (inspired from Johanna Krger)  #
+#                                                                              #
+#                                                                              #
+################################################################################
+####### create custom marker database #####
+# Script for building BLAST database input file of all COI and rbcl sequences in GenBank
+# use < this results in redundant hits (removed in later steps)
+#Cytochrome oxidase subunit 1 search keys:
+#co1[gene] AND mitochondrion[filter]
+#coi[gene] AND mitochondrion[filter]
+#cox1[gene] AND mitochondrion[filter]
+#coxi[gene] AND mitochondrion[filter]
+#("Cytochrome oxidase subunit 1[gene]" only two hits)
+#Ribulose bisphosphate carboxylase large chain
+#rbcl[gene] AND (plastid[filter] OR chloroplast[filter])
+#rubisco[gene] AND (chloroplast[filter] OR plastid[filter])
+#Download with long header:
+#"send to: coding sequences", e.g. as file named "sequence-coxi.fasta"
+################################################################################
+# INPUTS
+# DIR_SEQ: directory holding the downloaded GenBank files named sequence*
+# DIR_TMP: working directory for the intermediate files of this script
+# Both can be preset in the environment, the defaults are the original paths.
+DIR_SEQ="${DIR_SEQ:-/home/mohamed/work/RAHMA/}"
+DIR_TMP="${DIR_TMP:-/home/mohamed/work/RAHMA}"
+################################################################################
+# concatenate files
+# The input directory variable is DIR_SEQ. The former spelling DIR_SQ was never
+# set, so the glob was expanded against the filesystem root instead.
+# A trailing slash on DIR_SEQ is stripped to avoid a doubled separator.
+cat "${DIR_SEQ%/}"/sequence* > "$DIR_TMP/pre-all-sequences.fasta"
+#cat ~/marker-db/all_sequences/sequence* > pre-all-sequences.fasta
+# extract list of relevant hits
+# Each search keeps the headers of one gene spelling and drops look-alike
+# names of other genes, for example coxii or cox10.
+# The coxi search comes first and truncates the ID file, so IDs left over from
+# an aborted earlier run cannot leak into the result. This command used to be
+# fused into the comment line and was never executed.
+grep '^>' "$DIR_TMP/pre-all-sequences.fasta" | grep -i "gene=coxi" | \
+    grep -v -i "gene=coxii" > "$DIR_TMP/pre-all-ids.txt"
+grep '^>' "$DIR_TMP/pre-all-sequences.fasta" | grep -i "gene=coi" | \
+    grep -v -i "gene=coii" >> "$DIR_TMP/pre-all-ids.txt"
+# the last filter of the cox1 search removes introns that are usually called cox1i[1-9]
+grep '^>' "$DIR_TMP/pre-all-sequences.fasta" | grep -i "gene=cox1" | \
+    grep -v -i "gene=cox1[0-9]" | grep -v -i "gene=cox1[a-z]" >> "$DIR_TMP/pre-all-ids.txt"
+#grep ^">" pre-all-sequences.fasta | grep -i "gene=coi" | grep -v -i "gene=coii" >> pre-all-ids.txt
+#grep ^">" pre-all-sequences.fasta | grep -i "gene=cox1" | grep -v -i "gene=cox1[0-9]" | grep -v -i "gene=cox1[a-z]" >> pre-all-ids.txt
+grep '^>' "$DIR_TMP/pre-all-sequences.fasta" | grep -i "gene=co1" | \
+    grep -v -i "gene=co1[0-9]" >> "$DIR_TMP/pre-all-ids.txt"
+#grep ^">" pre-all-sequences.fasta | grep -i "gene=co1" | grep -v -i "gene=co1[0-9]" >> pre-all-ids.txt
+# Reduce every collected header to its bare identifier for the ID list:
+# keep the text up to the first space, then drop the leading ">"
+cut -d ' ' -f 1 "${DIR_TMP}/pre-all-ids.txt" > "${DIR_TMP}/cut-pre-all-ids.txt"
+cut -d '>' -f 2 "${DIR_TMP}/cut-pre-all-ids.txt" > "${DIR_TMP}/all-ids.txt"
+# The list is still redundant at this point. It cannot be shortened yet because
+# the long headers are needed for the extraction. They carry two accessions,
+# one for the genome and one for the gene, and only the genome one is used later.
+rm "${DIR_TMP}/pre-all-ids.txt" "${DIR_TMP}/cut-pre-all-ids.txt"
+# Cut each line of the merged sequence file at its first space, which leaves
+# headers without spaces
+cut -d ' ' -f 1 "${DIR_TMP}/pre-all-sequences.fasta" > "${DIR_TMP}/all-sequences.fasta"
+rm "${DIR_TMP}/pre-all-sequences.fasta"
+# extract relevant sequences from merged file of all downloaded sequences
+# program seqtk
+# The ID list is written to DIR_TMP by the preceding steps, so it has to be
+# read from there and not from the current directory.
+# The result markerWanted.fasta is written to the current directory.
+seqtk subseq "$DIR_TMP/all-sequences.fasta" "$DIR_TMP/all-ids.txt" > markerWanted.fasta
+# remove redundant sequences
+# cut off accessions of genes, leaving accessions of genomes
+# The headers contain no spaces any more, so the separator is the literal _cds
+awk -F'_cds' '{print $1}' markerWanted.fasta > cut-marker.fasta
+## print all rows in two columns: header, tab, sequence joined onto one line
+# The record text is passed as an argument to a fixed format, so a percent sign
+# in the data cannot be misread as a format directive. The END rule terminates
+# the last record with a newline.
+awk 'NR==1{printf "%s\t", $0; next} {if (/^>/) printf "\n%s\t", $0; else printf "%s", $0} END{if (NR) printf "\n"}' cut-marker.fasta > col-cut-marker.fasta
+# print lengths of sequences in front of each record
+awk '{ print length($2),$0 }' col-cut-marker.fasta > len-col-cut-marker.fasta
+# sort by accession, then by sequence length within each accession, longest first
+# The n modifier on key 1 makes the length comparison numeric. LC_ALL=C gives a
+# bytewise and reproducible ordering of the accessions.
+LC_ALL=C sort -k2,2 -k1,1nr len-col-cut-marker.fasta > sort-len-col-cut-marker.fasta
+# uniq, keep longest sequence: only the first record of every accession passes
+awk '!seen[$2]++' sort-len-col-cut-marker.fasta > uniq-sort-len-col-cut-marker.fasta
+# cut off sequence length:
+cut -d" " -f2 uniq-sort-len-col-cut-marker.fasta > cut-uniq-sort-len-col-cut-marker.fasta
+# print sequence under header again:
+awk '/^>/{print $1; print $2; next} {print}' cut-uniq-sort-len-col-cut-marker.fasta > db-marker.fasta
+# clean up the intermediate files
+# Every name matched by the glob ends in cut-marker.fasta. The final
+# db-marker.fasta does not match and is kept.
+rm -- *cut-marker.fasta
+rm -- markerWanted.fasta
+# these two files were written to DIR_TMP and not to the current directory
+rm -- "$DIR_TMP/all-sequences.fasta"
+rm -- "$DIR_TMP/all-ids.txt"