Commit 1290afb1 authored by Dr. Mohamed Mohamed Hefny Salim's avatar Dr. Mohamed Mohamed Hefny Salim
Browse files

add several scripts

parent a7dd8838
% Abundance_individual_spceis calculates the abundance (occurance, RRA,
% etc) for each spcies ('ALCES', 'COMP', 'COMBINED', 'CURVI', 'NUM',
% 'TSHO', 'PET')
%
% Copyright (C) Rahma Amen 2021 Potsdam University
tic
clear
close all
addpath(genpath('~/Matlab_Functions/'))
%% ============================== INPUTS =============================== %%
% Folders
KRAKEN_REPORT = '/home/mohamed/work/RAHMA/KRAKEN_REPORTS/';
ABUNDANCE_DIR = '/home/mohamed/work/RAHMA/ABUNDANCE/';
PLOT_DIR = '/home/mohamed/work/RAHMA/ABUNDANCE/PLOTS/';
dir_pavian = '/home/mohamed/work/RAHMA/tmp/'; % folder of pavian csv files of top 100 taxa (C, O, F, G, S)
% Settings
TaxaRankName = 'Species'; % 'Class' or 'Order' or 'Family' or 'Genus' or 'Species'
l_write_Abundance = 1; % Write the abundance text file
l_clean_Abundance = 1; % clean the abundance output folder
l_individual = 0; % Plot the figures for individual level
l_plot_pooled = 1; % Plot the figures for pooled level
l_figsave = 1; % save the figures
l_num = 0; % include c. num in choosing the common taxa
%% =================== T A X A S E T T I N G S ======================== %%
switch TaxaRankName
case 'Class'
TaxaRank = 'C';
nTaxa = 10;
case 'Order'
TaxaRank = 'O';
nTaxa = 15;
case 'Family'
TaxaRank = 'F';
nTaxa = 15;
case 'Genus'
TaxaRank = 'G';
nTaxa = 15;
case 'Species'
TaxaRank = 'S';
nTaxa = 15;
otherwise
error('Not implemented yet.')
end
if l_individual
CASES = {'ALCES','COMP','COMBINED','CURVI','NUM','TSHO','PET'};
else
CASES = {'COMBINED'};
end
Food_Taxa_list = find_commen_taxa(dir_pavian,TaxaRank,nTaxa,l_num);
Food_Taxa_list = replace(Food_Taxa_list, char(160), char(32)); % This is very important to convert space with 160 char to 32 to compare spcies
% Check that l_clean_Abundance is set intentionaly and clean accordingly
if l_clean_Abundance
answer = questdlg('Would you like to delet Abundance folder?', ...
'Abundance Folder', ...
'Yes','No thank you','No thank you');
% Handle response
switch answer
case 'Yes'
cmd = ['rm -r ' ABUNDANCE_DIR '/*']; system(cmd);
case 'No thank you'
disp('OK. l_clean_Abundance is set to False')
end
end
% Create Abundance folder
if l_write_Abundance
if ~exist(ABUNDANCE_DIR, 'dir')
cmd = ['mkdir ' ABUNDANCE_DIR]; system(cmd);
end
end
%% Analysis
% loop for each species and the pooled cases
for i_case = 1:length(CASES)
% Create sub-folder for each species
DIR = [KRAKEN_REPORT CASES{i_case} '/'];
DIR_OUT = [ABUNDANCE_DIR CASES{i_case} '/'];
if l_write_Abundance
if ~exist(DIR_OUT, 'dir')
cmd = ['mkdir ' DIR_OUT]; system(cmd);
end
end
REPORTS = dir([DIR '*_report']);
ABUNDANCE = zeros(length(REPORTS),length(Food_Taxa_list));
% Initiate the species of the pooled cases
if strcmp(CASES{i_case},'COMBINED')
SPECIS = strings(1,length(REPORTS));
% Note: SPECIS_NAME is not used because the text is not visiable
% below the figure. We decided to add it manually using Inkscape
SPECIS_NAME = strings(1,length(REPORTS));
end
% loop for each KRAKEN report
for i_report = 1:length(REPORTS)
T = readtable( [DIR REPORTS(i_report).name]);
disp(['Processing the case: ' REPORTS(i_report).name])
% Define the species of the pooled cases
if strcmp(CASES{i_case},'COMBINED')
SPECIS(i_report)=erase(REPORTS(i_report).name,"_COMBINED_report");
end
% Open abundance report
if l_write_Abundance
if strcmp(CASES{i_case},'COMBINED')
fname = [DIR_OUT SPECIS(i_report) '_' TaxaRankName];
else
fname = [DIR_OUT erase(REPORTS(i_report).name,"report") TaxaRankName];
end
fileID = fopen(join(fname,""),'w'); % join is used here to convert to text
end
% loop for each food taxa
for i_food = 1:length(Food_Taxa_list)
n_class = 0;
% loop for each line in KRAKEN report
for i_line = 1:length(T.Var1)
if strcmp(T.Var4(i_line),TaxaRank) && ...
strcmp(T.Var6{i_line},Food_Taxa_list{i_food})
n_class = T.Var2(i_line);
break
end
end
% fill CASSES array
ABUNDANCE(i_report,i_food) = n_class;
if l_write_Abundance
fprintf(fileID,'%-14s %-12d\n',Food_Taxa_list{i_food}, ...
ABUNDANCE(i_report,i_food));
end
end
end
% Calculate
COUNT = ABUNDANCE;
COUNT(ABUNDANCE > 0) = 1;
COUNT_WEIGHT = COUNT./sum(COUNT,2)*100;
RRA = ABUNDANCE./sum(ABUNDANCE,2)*100;
switch CASES{i_case}
case 'ALCES'
ABUNDANCE_ALCES = ABUNDANCE;
COUNT_ALCES = COUNT;
COUNT_WEIGHT_ALCES = COUNT_WEIGHT;
RRA_ALCES = RRA;
case 'COMP'
ABUNDANCE_COMP = ABUNDANCE;
COUNT_COMP = COUNT;
COUNT_WEIGHT_COMP = COUNT_WEIGHT;
RRA_COMP = RRA;
case 'COMBINED'
ABUNDANCE_COMBINED = ABUNDANCE;
COUNT_COMBINED = COUNT;
COUNT_WEIGHT_COMBINED = COUNT_WEIGHT;
RRA_COMBINED = RRA;
case 'CURVI'
ABUNDANCE_CURVI = ABUNDANCE;
COUNT_CURVI = COUNT;
COUNT_WEIGHT_CURVI = COUNT_WEIGHT;
RRA_CURVI = RRA;
case 'NUM'
ABUNDANCE_NUM = ABUNDANCE;
COUNT_NUM = COUNT;
COUNT_WEIGHT_NUM = COUNT_WEIGHT;
RRA_NUM = RRA;
case 'PET'
ABUNDANCE_PET = ABUNDANCE;
COUNT_PET = COUNT;
COUNT_WEIGHT_PET = COUNT_WEIGHT;
RRA_PET = RRA;
case 'TSHO'
ABUNDANCE_TSHO = ABUNDANCE;
COUNT_TSHO = COUNT;
COUNT_WEIGHT_TSHO = COUNT_WEIGHT;
RRA_TSHO = RRA;
end
end
%% Plot figure
if ~exist(PLOT_DIR, 'dir')
cmd = ['mkdir ' PLOT_DIR]; system(cmd);
end
for i_case = 1:length(CASES)
if strcmp(CASES{i_case},'COMBINED')
continue
end
switch CASES{i_case}
case 'ALCES'
ABUNDANCE = ABUNDANCE_ALCES;
COUNT = COUNT_ALCES;
COUNT_WEIGHT = COUNT_WEIGHT_ALCES;
RRA = RRA_ALCES;
for i_specis = 1:length(SPECIS)
if strcmp(SPECIS{i_specis},'ALCES')
i_sp = i_specis;
SPECIS_NAME{i_sp}='C. alces';
break
end
end
case 'COMP'
ABUNDANCE = ABUNDANCE_COMP;
COUNT = COUNT_COMP;
COUNT_WEIGHT = COUNT_WEIGHT_COMP;
RRA = RRA_COMP;
for i_specis = 1:length(SPECIS)
if strcmp(SPECIS{i_specis},'COMP')
i_sp = i_specis;
SPECIS_NAME{i_sp}='C. compressirostris';
break
end
end
case 'CURVI'
ABUNDANCE = ABUNDANCE_CURVI;
COUNT = COUNT_CURVI;
COUNT_WEIGHT = COUNT_WEIGHT_CURVI;
RRA = RRA_CURVI;
for i_specis = 1:length(SPECIS)
if strcmp(SPECIS{i_specis},'CURVI')
i_sp = i_specis;
SPECIS_NAME{i_sp}='C. curvirostris';
break
end
end
case 'NUM'
ABUNDANCE = ABUNDANCE_NUM;
COUNT = COUNT_NUM;
COUNT_WEIGHT = COUNT_WEIGHT_NUM;
RRA = RRA_NUM;
for i_specis = 1:length(SPECIS)
if strcmp(SPECIS{i_specis},'NUM')
i_sp = i_specis;
SPECIS_NAME{i_sp}='C. numenius';
break
end
end
case 'PET'
ABUNDANCE = ABUNDANCE_PET;
COUNT = COUNT_PET;
COUNT_WEIGHT = COUNT_WEIGHT_PET;
RRA = RRA_PET;
for i_specis = 1:length(SPECIS)
if strcmp(SPECIS{i_specis},'PET')
i_sp = i_specis;
SPECIS_NAME{i_sp}='G. petersii';
break
end
end
case 'TSHO'
ABUNDANCE = ABUNDANCE_TSHO;
COUNT = COUNT_TSHO;
COUNT_WEIGHT = COUNT_WEIGHT_TSHO;
RRA = RRA_TSHO;
for i_specis = 1:length(SPECIS)
if strcmp(SPECIS{i_specis},'TSHO')
i_sp = i_specis;
SPECIS_NAME{i_sp}='C. tshokwe';
break
end
end
end
% individual plots
if l_individual
fname = [PLOT_DIR CASES{i_case} '_' TaxaRankName '_COUNT.pdf'];
COUNT(end+1,:) = COUNT_COMBINED(i_sp,:);
plot_abundance(COUNT,fname,'Count',size(COUNT_COMBINED,2), ...
Food_Taxa_list,0,0,l_figsave)
fname = [PLOT_DIR CASES{i_case} '_' TaxaRankName '_COUNT_WEIGHT.pdf'];
COUNT_WEIGHT(end+1,:) = COUNT_WEIGHT_COMBINED(i_sp,:);
plot_abundance(COUNT_WEIGHT,fname,'Occurrence, %',100, ...
Food_Taxa_list,0,0,l_figsave)
fname = [PLOT_DIR CASES{i_case} '_' TaxaRankName '_RRA.pdf'];
RRA(end+1,:) = RRA_COMBINED(i_sp,:);
plot_abundance(RRA,fname,'Relative read abundance, %',100, ...
Food_Taxa_list,0,0,l_figsave)
end
end
%%
% Pooled plots
close all
if l_plot_pooled
fname = [PLOT_DIR 'COMBINED' '_' TaxaRankName '_RRA.pdf'];
plot_abundance(RRA_COMBINED,fname,'Relative read abundance, %',100, ...
Food_Taxa_list,0,1,l_figsave)
fname = [PLOT_DIR 'COMBINED' '_' TaxaRankName '_RRA_Legend.pdf'];
plot_abundance(RRA_COMBINED,fname,'Relative read abundance, %',100, ...
Food_Taxa_list,1,1,l_figsave)
end
toc
function plot_abundance(ab,fn,ylbl,ylm,lgd,l_lgd,l_p,l_save)
fz = 26; fzlgnd = 12; lw = 1.5;
h = figure('units','normalized','outerposition',[0 0 1 1]);
bh = bar(ab,'stacked');
% Change the colors of the stacked bars
set(bh, 'FaceColor', 'Flat')
% clrs = mat2cell(distinguishable_colors(numel(bh)),ones(numel(bh),1),3); % here we need the external function distinguishable_colors, otherwise use jet
% clrs = mat2cell(linspecer(numel(bh)),ones(numel(bh),1),3); % here we need the external function distinguishable_colors, otherwise use jet
clrs = mat2cell(cbrewer('div','Spectral',numel(bh),'PCHIP'),ones(numel(bh),1),3); % here we need the external function distinguishable_colors, otherwise use jet
set(bh, {'CData'}, flip(clrs))
% Remove x axis
set(gca,'XTick',[])
set(gca,'XColor','white')
bh(1).BaseLine.Visible = 'off';
box off
% White figure
set(gcf,'InvertHardCopy','off','Color','white');
ylim([0 ylm])
% Add * for combined bar
if l_p < 1
text(size(ab,1),0,'*','HorizontalAlignment','center', ...
'VerticalAlignment','top','FontSize',fz);
end
% adjust axis
set(gca,'FontSize',fz); set(gca,'linewidth',lw,'FontName','Times')
if l_p < 1
xlabel('Samples','FontName','Times','FontSize',fz,'Color','black')
end
ylabel(ylbl,'FontName','Times','FontSize',fz)
% legend
if l_lgd
legend(lgd,'Location','northeastoutside','FontSize',fzlgnd)
end
%save file
if l_save
export_fig(h,fn,'-m2')
close(h)
end
end
function cns = find_commen_taxa(dir_pavian,taxa,ntaxa,lnum)
if lnum
cases = {'ALCES','COMP','CURVI','NUM','TSHO','PET'};
else
cases = {'ALCES','COMP','CURVI','TSHO','PET'};
end
for i = 1:length(cases)
f = [dir_pavian cases{i} '-results-211107_' taxa '.csv'];
Table = readtable(f);
n = min(ntaxa, length(Table.Name));
TaxaCases(i,1:n) = Table.Name(1:n);
TaxaNumbr(i,1:n) = Table.CladeReads(1:n);
end
c = unique(TaxaCases);
c(cellfun('isempty',c)) = [];
cn = zeros(length(c),2);
for i = 1:length(cases)
for j = 1:length(c)
cn(j,2) = j;
for k = 1:ntaxa
if strcmp(c{j},TaxaCases{i,k})
cn(j,1) = cn(j,1) + str2double(TaxaNumbr(i,k));
end
end
end
end
b = sortrows(cn,1,'descend');
for i = 1:length(c)
cns{i} = c{b(i,2)};
end
end
% Abundance_all_spceis calculates the abundance (occurance, RRA, etc) for
% all the spcies ('ALCES','COMP','COMBINED','CURVI','NUM','TSHO','PET')
%
% Copyright (C) Rahma Amen 2021 Potsdam University
tic
clear
close all
addpath(genpath('~/Matlab_Functions/'))
%% ============================== INPUTS =============================== %%
% Folders
KRAKEN_REPORT = '/home/mohamed/work/RAHMA/KRAKEN_REPORTS/';
ABUNDANCE_DIR = '/home/mohamed/work/RAHMA/ABUNDANCE/';
PLOT_DIR = '/home/mohamed/work/RAHMA/ABUNDANCE/PLOTS/';
PAVIAN_DIR = '/home/mohamed/work/RAHMA/PAVIAN_CSV/'; % folder of pavian csv files of top 100 taxa (C, O, F, G, S)
% Settings
TaxaRankName = 'Class'; % 'Class' or 'Order' or 'Family' or 'Genus' or 'Species'
l_write_Abundance = 1; % Write the abundance text file
l_clean_Abundance = 1; % clean the abundance output folder
l_figsave = 1; % save the figures
l_numenius = 0; % include c. num in choosing the common taxa
%% =================== T A X A S E T T I N G S ======================== %%
switch TaxaRankName
case 'Class'
TaxaRank = 'C';
nTaxa = 10;
case 'Order'
TaxaRank = 'O';
nTaxa = 15;
case 'Family'
TaxaRank = 'F';
nTaxa = 15;
case 'Genus'
TaxaRank = 'G';
nTaxa = 15;
case 'Species'
TaxaRank = 'S';
nTaxa = 15;
otherwise
error('Not implemented yet.')
end
CASES = {'COMBINED'};
Food_Taxa_list = find_commen_taxa(PAVIAN_DIR,TaxaRank,nTaxa,l_numenius);
Food_Taxa_list = replace(Food_Taxa_list, char(160), char(32)); % This is very important to convert space with 160 char to 32 to compare spcies
% Check that l_clean_Abundance is set intentionaly and clean accordingly
if l_clean_Abundance
answer = questdlg('Would you like to delet Abundance folder?', ...
'Abundance Folder', ...
'Yes','No thank you','No thank you');
% Handle response
switch answer
case 'Yes'
cmd = ['rm -r ' ABUNDANCE_DIR '/COMBINED']; system(cmd);
case 'No thank you'
disp('OK. l_clean_Abundance is set to False')
end
end
% Create Abundance folder
if l_write_Abundance
if ~exist(ABUNDANCE_DIR, 'dir')
cmd = ['mkdir ' ABUNDANCE_DIR]; system(cmd);
end
end
%% Analysis
% loop for each species and the pooled cases
% for i_case = 1:length(CASES)
% Create sub-folder for each species
DIR = [KRAKEN_REPORT CASES{1} '/'];
DIR_OUT = [ABUNDANCE_DIR CASES{1} '/'];
if l_write_Abundance
if ~exist(DIR_OUT, 'dir')
cmd = ['mkdir ' DIR_OUT]; system(cmd);
end
end
REPORTS = dir([DIR '*_report']);
ABUNDANCE = zeros(length(REPORTS),length(Food_Taxa_list));
% Initiate the species of the pooled cases
SPECIS = strings(1,length(REPORTS));
% Note: SPECIS_NAME is not used because the text is not visiable
% below the figure. We decided to add it manually using Inkscape
SPECIS_NAME = strings(1,length(REPORTS));
% loop for each KRAKEN report
for i_report = 1:length(REPORTS)
T = readtable( [DIR REPORTS(i_report).name]);
disp(['Processing the case: ' REPORTS(i_report).name])
SPECIS(i_report)=erase(REPORTS(i_report).name,"_COMBINED_report");
% Open abundance report
if l_write_Abundance
fname = [DIR_OUT SPECIS(i_report) '_' TaxaRankName];
fileID = fopen(join(fname,""),'w'); % join is used here to convert to text
end
% Combine case: loop for each food taxa
for i_food = 1:length(Food_Taxa_list)
n_class = 0;
% loop for each line in KRAKEN report
for i_line = 1:length(T.Var1)
if strcmp(T.Var4(i_line),TaxaRank) && ...
strcmp(T.Var6{i_line},Food_Taxa_list{i_food})
n_class = T.Var2(i_line);
break
end
end
% fill CASSES array
ABUNDANCE(i_report,i_food) = n_class;
if l_write_Abundance
fprintf(fileID,'%-14s %-12d\n',Food_Taxa_list{i_food}, ...
ABUNDANCE(i_report,i_food));
end
end
end
% Calculate count, count_weight, RRA
COUNT_COMBINED = ABUNDANCE;
COUNT_COMBINED(ABUNDANCE > 0) = 1;
COUNT_WEIGHT_COMBINED = COUNT_COMBINED./sum(COUNT_COMBINED,2)*100;
RRA_COMBINED = ABUNDANCE./sum(ABUNDANCE,2)*100;
%% Plot figure
if ~exist(PLOT_DIR, 'dir')
cmd = ['mkdir ' PLOT_DIR]; system(cmd);
end
%% ========================= P L O T S ================================%%
fname = [PLOT_DIR 'COMBINED' '_' TaxaRankName '_RRA.pdf'];
plot_abundance(RRA_COMBINED,fname,'Relative read abundance, %',100, ...
Food_Taxa_list,0,1,l_figsave)
fname = [PLOT_DIR 'COMBINED' '_' TaxaRankName '_RRA_Legend.pdf'];
plot_abundance(RRA_COMBINED,fname,'Relative read abundance, %',100, ...
Food_Taxa_list,1,1,l_figsave)
toc
function plot_abundance(ab,fn,ylbl,ylm,lgd,l_lgd,l_p,l_save)
fz = 26; fzlgnd = 12; lw = 1.5;
h = figure('units','normalized','outerposition',[0 0 1 1]);
bh = bar(ab,'stacked');
% Change the colors of the stacked bars
set(bh, 'FaceColor', 'Flat')
% clrs = mat2cell(distinguishable_colors(numel(bh)),ones(numel(bh),1),3); % here we need the external function distinguishable_colors, otherwise use jet
% clrs = mat2cell(linspecer(numel(bh)),ones(numel(bh),1),3); % here we need the external function distinguishable_colors, otherwise use jet
clrs = mat2cell(cbrewer('div','Spectral',numel(bh),'PCHIP'),ones(numel(bh),1),3); % here we need the external function distinguishable_colors, otherwise use jet
set(bh, {'CData'}, flip(clrs))
% Remove x axis
set(gca,'XTick',[])
set(gca,'XColor','white')
bh(1).BaseLine.Visible = 'off';
box off
% White figure
set(gcf,'InvertHardCopy','off','Color','white');
ylim([0 ylm])
% Add * for combined bar
if l_p < 1
text(size(ab,1),0,'*','HorizontalAlignment','center', ...
'VerticalAlignment','top','FontSize',fz);
end