From b8a40feed6694ee76731d17e65eb1539882f8486 Mon Sep 17 00:00:00 2001 From: domenico-somma <34346930+domenico-somma@users.noreply.github.com> Date: Sun, 4 Mar 2018 20:25:55 +0000 Subject: [PATCH 1/8] Dev version improved code, now the genes selected can be stored in a variable --- CHANGE.txt | 7 + TO DO.txt | 1 - docs/manual.rst | 2 +- papillon.py | 421 +++++++++++++++++++----------------------- setup.py | 2 +- test/test_papillon.py | 321 ++++++++++++++++---------------- 6 files changed, 357 insertions(+), 397 deletions(-) diff --git a/CHANGE.txt b/CHANGE.txt index 550bd0c..f1c361b 100644 --- a/CHANGE.txt +++ b/CHANGE.txt @@ -1,3 +1,10 @@ +v 0.2.0, -18 -- Major changes: +Now you can keep the genes subselection in a variable +self.read_db() deprecated +self.plot() deprecated +Add check if papillon folder is removed +Improved code (Single Responsibility Principle) + v 0.1.1, 21-01-18 -- Minor changes: add python 3 restriction, add the function to load directly the cuffdiff files, changed function names: self.read_db() -> self.read_folder() and self.plot() -> self.lineplot(),improved code quality (tried to follow the single responsibility principle), removed some potential bugs. diff --git a/TO DO.txt b/TO DO.txt index ea7fecf..5ea5638 100644 --- a/TO DO.txt +++ b/TO DO.txt @@ -5,7 +5,6 @@ * plot genes detected/significant as different lines * Compare self.selected between two different experiments -* Move plots functions in another py file? * Wondering if z_score calc, fusion_gene_id and only_FPKM should be only one function? * Add Volcano plot diff --git a/docs/manual.rst b/docs/manual.rst index 6fee3c0..1fd45a4 100644 --- a/docs/manual.rst +++ b/docs/manual.rst @@ -1,4 +1,4 @@ -Papillon is a python alternative to cummeRbund to read and plot Galaxy +Papillon is a python alternative to cummeRbund to read and plot cuffdiff/Galaxy RNA-seq data. 
To start diff --git a/papillon.py b/papillon.py index 53f42a7..c8042b2 100644 --- a/papillon.py +++ b/papillon.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -"""A python version of CummeRbund -to read and plot Galaxy/cuffdiff RNA-seq data""" +"""A python version of CummeRbund to read and plot Galaxy/cuffdiff RNA-seq data""" import os import warnings @@ -18,12 +17,9 @@ def read_db(path, drop_comparison=None): - "Use read_folder() instead. read_db() will not work in the future" + """Deprecated. Use read_folder()""" warnings.warn( - 'Use read_folder() instead. read_db() will not work in the future', DeprecationWarning) - if drop_comparison is None: - drop_comparison = [] - return read_folder(path, drop_comparison) + "read_db() deprecated. Use read_folder() instead.", DeprecationWarning) def read_folder(path, drop_comparison=None): @@ -35,7 +31,6 @@ def read_folder(path, drop_comparison=None): re-calculate significant genes/isoforms""" if drop_comparison is None: drop_comparison = [] - try: isoform_fpkm = pd.read_csv( str(path + "/isoforms.fpkm_tracking"), @@ -104,6 +99,9 @@ def read_files(files, path=None, drop_comparison=None): return _papillon_builder(isoform_fpkm, isoform_diff, gene_fpkm, gene_diff, path, drop_comparison) +def _make_folder(path): + if not os.path.exists(path): + os.makedirs(path) def _papillon_builder(isoform_fpkm, isoform_diff, gene_fpkm, gene_diff, path, drop_comparison): """Accept cuffdiff/cummeRbund files, check whether the files are correct, @@ -135,9 +133,7 @@ def _papillon_builder(isoform_fpkm, isoform_diff, gene_fpkm, gene_diff, path, dr path = "Papillion/" else: path = str(path + "/Papillon/") - if not os.path.exists(path): - os.makedirs(path) - # TO DO - add check if the folder is moved after creation + _make_folder(path) print("Creating dataframe...") # find samples name (using isoforms, but it's the same with genes) @@ -184,7 +180,7 @@ def _papillon_builder(isoform_fpkm, isoform_diff, gene_fpkm, gene_diff, path, dr isoforms_detect, 
isoforms_significant = _generate_df( "isoform", samples, gene_fpkm, gene_diff, isoform_fpkm, isoform_diff, comparisons) - return Papillon(path, samples, comparisons, genes_detect, genes_significant, isoforms_detect, isoforms_significant) + return Papillon_db(path, samples, comparisons, genes_detect, genes_significant, isoforms_detect, isoforms_significant) # Not working now - to fix to easy add features # if __name__ == "__main__": @@ -314,19 +310,12 @@ def __str__(self): e = "Isoform Detected: " + str(len(self.isoforms_detect)) + "\n" f = "Isoform differential expressed: " + \ str(len(self.isoforms_significant)) + "\n" - try: - g = str(len(self.selected)) + " " + \ - self.type_selected + " selected\n" - except AttributeError: - g = "None of the genes is selected" - visual = a + b + c + d + e + f + g + visual = a + b + c + d + e + f return visual @staticmethod def _significant(df_detected, comparison, what): - """Users should not use this function directly. - Calculate significant expressed genes. - """ + """Calculate significant expressed genes.""" if what not in ["gene", "isoform"]: raise Exception("what= not known") df_significant = df_detected[ @@ -335,10 +324,7 @@ def _significant(df_detected, comparison, what): return df_significant def _compare(self): - """ - Users should not use this function directly. - Compare genes and isoforms significantly expressed - """ + """Compare genes and isoforms significantly expressed""" genes = list(self.genes_significant["gene_short_name"]) isoforms = list(self.isoforms_significant["gene_short_name"]) genes_not_found = [] @@ -367,28 +353,6 @@ def _compare(self): print(set(isoforms_not_found)) return genes_not_found, isoforms_not_found, n # Only for tests so far. - def selected_exist(self, remove=False): - """Check if self.selected exists - - remove: True/False. 
If True remove self.selected and self.type_selected - """ - if remove is True: - try: - del self.selected - del self.type_selected - except AttributeError: - pass - elif remove is False: - try: - self.selected - return True - except AttributeError: - raise Exception("No gene selected") - else: - raise Exception("Remove= value not known") - - # Modify functions: - def dropComparison(self, comparison): """Drop Comparison (str) or list of comparisons and re-calculate df_significant @@ -403,7 +367,6 @@ def dropComp(comp): del self.genes_detect[comp] del self.genes_detect[str("q-value_" + comp)] self.comparison.remove(comp) - self.selected_exist(remove=True) print(comp, " removed") else: raise Exception(comp, " not found, please double check it") @@ -425,7 +388,6 @@ def change_order(self, new_order): """Change the samples order new_order: list of samples order""" - self.selected_exist(remove=True) n_sampl = len(self.samples) if len(new_order) != n_sampl: raise Exception("Number of samples doesn't match") @@ -440,124 +402,29 @@ def change_order(self, new_order): self.genes_significant = self.genes_significant[cols] self.isoforms_detect = self.isoforms_detect[cols] self.isoforms_significant = self.isoforms_significant[cols] - - -class Papillon(Papillon_db): - """Select and plot genes/isoforms from a Papillon_db - - self.selected - gene/isoform selected - self.type_selected - either gene or isoform according with selection type - """ - - # Select genes functions - + def _select(self, genelist, what, comparison, sign): - """Users should not use this function directly. 
- Part of get_gene/get_isoform function - """ + """Part of get_gene/get_isoform function""" + if what != "gene" and what != "isoform": raise Exception("Only what=gene or what=isoform admitted") - self.type_selected = what gene_list = _obtain_list(genelist, path=self.path) if what == "gene": df = pd.DataFrame.copy(self.genes_significant) elif what == "isoform": df = pd.DataFrame.copy(self.isoforms_significant) - + + n=0 if gene_list != []: - df["Selected"] = [ - True if name in gene_list else False for name in df["gene_short_name"]] + df["Selected"] = [True if name in gene_list else False for name in df["gene_short_name"]] df = df[df["Selected"] == True].iloc[:, :-1] for name in gene_list: if name not in list(df["gene_short_name"]): print("Gene name not found:\t", name) - self.selected = df.copy() - self._sub_select(comparison, sign) - - def _sub_select(self, comparison, sign): - """Users should not use this function directly. - Part of get_gene/get_isoform function - """ - ACCEPTED_SIGN = [">", "<", None] - - if comparison is None: - if sign is None: - return - elif sign is not None: - raise Exception("Sign passed, but not comparison") - elif comparison is not None: - if sign not in ACCEPTED_SIGN: - raise Exception('Only ">" "<" usable.') - if comparison not in self.comparison: - raise Exception("Comparison not found") - self.selected = self.selected[self.selected[comparison] == True] - sample1, sample2 = _vs(comparison) - if sign == ">": - self.selected = self.selected[self.selected[ - _FPKM(sample1)] > self.selected[_FPKM(sample2)]] - elif sign == "<": - self.selected = self.selected[self.selected[_FPKM( - sample1)] < self.selected[_FPKM(sample2)]] - return - - def get_gene(self, genelist=None, comparison=None, sign=None, export=False): - """This function select genes. It creates - - self.selected - self.type_selected="gene". 
- - genelist - accept string (gene name), list of gene names or file - with a list of gene names - comparison - accept only 1 comparison as str (already present in - the data) - sign - usable in combination with comparison, accept either ">" or - "<" - export - True/False whether want or not export the dataframe of - selected genes - """ - self._select(genelist, "gene", comparison, sign) - # self.selected.set_index(["gene_short_name"], inplace=True, - # verify_integrity=True) #I don't know if could be useful - print("\nNumber of gene selected: ", len(self.selected)) - self._export(self.selected, name="selected_gene", export=export) - # maybe should return selected to the class? - # To do - Return number genes not found - - def get_isoform(self, genelist=None, comparison=None, sign=None, export=False, show_dup=False): - """This function select isoforms. It creates - self.selected - self.type_selected="isoform" - - genelist - accept string (gene name), list of gene names or file - with a list of gene names - comparison - accept only 1 comparison as str (already present in - the data) - sign - usable in combination with comparison, accept either ">" or - "<" - export - True/False whether want or not export the dataframe of - selected genes - show_dup - True/False whether want or not highlight duplicated - isoforms for the same gene""" - self._select(genelist, "isoform", comparison, sign) - - try: - del self.selected["duplicate"] - except KeyError: - pass - - if show_dup is True: - self.selected["duplicate"] = self.selected.duplicated( - "gene_short_name", keep=False) - else: - pass - # TO DO if remove_dup == True: # it'd remove the one with lower - # q-value. and if the q-values are the same??? - - print("\nNumber of isoform selected: ", len(self.selected)) - self._export(self.selected, name="selected_isoform", export=export) - # maybe should return selected to the class? 
- # To do - Return number isoforms not found - + n+=1 + print("Number of gene not found: ",n) + return df + def search(self, word, where, how="table", export=False): """search among genes/isoforms names in detected and significant @@ -629,39 +496,152 @@ def df_or_list(df_, how_): self._export(found, export=export, name="search_result") return found + + def get_gene(self, genelist=None, comparison=None, sign=None): + """This function select genes. It creates a Papillon object - def _export(self, thing, export, name=None, image_extension=".png"): # add .pdf? + genelist - accept string (1 gene name), list of gene names or file + with a list of gene names + comparison - accept only 1 comparison as str (already present in + the data) + sign - usable in combination with comparison, accept either ">" or + "<" + """ + df = self._select(genelist, "gene", comparison, sign) + return Papillon(df, genelist, "gene", comparison, sign, self.path, self.samples) + # To do - Return number genes not found + + def get_isoform(self, genelist=None, comparison=None, sign=None): + """This function select isoforms. 
It creates a Papillon object + + genelist - accept string (gene name), list of gene names or file + with a list of gene names + comparison - accept only 1 comparison as str (already present in + the data) + sign - usable in combination with comparison, accept either ">" or + "<" + export - True/False whether want or not export the dataframe of + selected genes""" + df = self._select(genelist, "isoform", comparison, sign) + return Papillon(df, genelist, "isoform", comparison, sign, self.path, self.samples) + + def _export(self, thing, export, name=None): """Manage dataframe or image export parameter.""" if export is False: return elif export is True: - try: - thing.to_excel(str(self.path + name + '.xls'), - sheet_name='Sheet1') - print("\nExported as " + name + ".xls\n") - except AttributeError: - pass - try: - thing.savefig(str(self.path + name + image_extension)) - print("\nExported as " + name + image_extension) - except: - raise Exception("Export error") + _make_folder(self.path) + thing.to_excel(str(self.path + name + '.xls'), sheet_name='Sheet1') + print("\nExported as " + name + ".xls\n") else: raise Exception("export= can be only 'False' or 'True'") - # Plot functions - should it be another class? 
+ +class Papillon: + def __init__(self, df, genelist, what, comparison, sign, path, samples): + self.df=df + self.selected=df.copy() + self.genelist=genelist + self.what=what + self.comparison=comparison + self.sign=sign + self.path=path + self.samples=samples + self._sub_select(comparison, sign) + + def _sub_select(self, comparison, sign): + """ Part of get_gene/get_isoform function""" + + ACCEPTED_SIGN = [">", "<", None] + + if comparison is None: + if sign is None: + return + elif sign is not None: + raise Exception("Sign passed, but not comparison") + elif comparison is not None: + if sign not in ACCEPTED_SIGN: + raise Exception('Only ">" "<" usable.') + if comparison not in self.comparison: + raise Exception("Comparison not found") + self.selected = self.selected[self.selected[comparison] == True] + sample1, sample2 = _vs(comparison) + if sign == ">": + self.selected = self.selected[self.selected[ + _FPKM(sample1)] > self.selected[_FPKM(sample2)]] + elif sign == "<": + self.selected = self.selected[self.selected[_FPKM( + sample1)] < self.selected[_FPKM(sample2)]] + print("\nNumber of ", self.what," selected: ", len(self.selected)) + + def __str__(self): + a = "Samples: " + str(self.samples) + "\n" + b = "Comparison: " + str(self.comparison) + "\n" + c = "Number of "+ self.what + " selected: "+ str(len(self.df)) + "\n" + visual = a + b + c + return visual + + + def _export(self, thing, export, name=None): + """Manage dataframe or image export parameter.""" + if export is False: + return + elif export is True: + _make_folder(self.path) + thing.to_excel(str(self.path + name + '.xls'), sheet_name='Sheet1') + print("\nExported as " + name + ".xls\n") + else: + raise Exception("export= can be only 'False' or 'True'") + + def __getattr__(self, arg): + _plot=_Plot() + setattr(_plot, "pp", self) + return getattr(_plot, arg) + + def swap_gene_isoform(): + pass + #TO DO + + def __add__(): + pass + #TO DO + + def __radd__(): + pass + #Maybe TO DO + + def 
__import_excel(self, filename, type_selected): + """Only for testing. Users should not use this function directly""" + if type_selected not in ["gene", "isoform"]: + raise Exception("type_selected can be only 'gene' or 'isoform'") + self.type_selected = type_selected + filename = str(self.path + filename) + self.selected = pd.read_excel(filename, index_col=0) + try: + del self.selected["duplicate"] + except KeyError: + pass + if __name__ == "__main__": + print( + self.selected.index, + self.selected.columns, + self.selected.head(), + len(self.selected.columns) + ) + +class _Plot: + """ """ @staticmethod - def _fusion_gene_id(df, type_selected, change_index=False): - """Users should not use this function directly. - Append a "gene/ID" column to the dataframe, and use gene + def _fusion_gene_id(df, what, change_index=False): + """Append a "gene/ID" column to the dataframe, and use gene name+id(index) as values, usable or not as index""" # print(df) - if type_selected == "gene": + if what == "gene": if change_index is True: df.set_index('gene_short_name', inplace=True) return df - elif type_selected == "isoform": + elif what == "isoform": df["gene/ID"] = df['gene_short_name'].map(str) + " " + df.index if change_index is True: df.set_index("gene/ID", inplace=True) @@ -669,24 +649,22 @@ def _fusion_gene_id(df, type_selected, change_index=False): return df def onlyFPKM(self, return_as, **option): - """It uses self.selected or an extra_df and Return only FPKM columns. + """Take a Papillon dataframe and a list of samples, return only FPKM columns. 
return as: "df" - pandas DataFrame "array" - numpy array - "gene name" - pandas DataFrame containing gene names - - **option accept extra_df as exernal Pandas df""" - self.selected_exist() - df = self.selected.copy() + "gene name" - pandas DataFrame containing gene names""" + df=self.pp.selected if isinstance(option.get("extra_df"), pd.DataFrame): df = option.get("extra_df") + samples=self.pp.samples if return_as == "df": - df = df.loc[:, _FPKM(self.samples)] + df = df.loc[:, _FPKM(samples)] elif return_as == "array": - df = df.loc[:, _FPKM(self.samples)].values + df = df.loc[:, _FPKM(samples)].values elif return_as == "gene name": - columns = ['gene_short_name'] + _FPKM(self.samples) + columns = ['gene_short_name'] + _FPKM(samples) df = df.loc[:, columns] else: raise Exception( @@ -694,12 +672,16 @@ def onlyFPKM(self, return_as, **option): if option.get("remove_FPKM_name") is True: mydic = {} - n = len(self.samples) + n = len(self.pp.samples) while n != 0: n -= 1 - mydic[_FPKM(self.samples[n])] = self.samples[n] - df.rename(columns=mydic, inplace=True) + mydic[_FPKM(self.pp.samples[n])] = self.pp.samples[n] + df.rename(columns=mydic, inplace=True) return df + + def plot(**parameter): + """Deprecated. Use self.lineplot()""" + warnings.warn('self.plot() is deprecated. Use self.lineplot() instead.', DeprecationWarning) def heatmap(self, z_score=True, col_cluster=False, method="complete", cmap="seismic", export=False, **options): """Generate heatmap using selected genes/isoforms @@ -712,19 +694,15 @@ def heatmap(self, z_score=True, col_cluster=False, method="complete", cmap="seis **options - all the options accepted by seaborn.clustermap default metric is euclidean. """ - if len(self.samples) > 10: + if len(self.pp.samples) > 10: raise Exception("High-dimensional data. 
Ronan et al., 2016") - - self.selected_exist() - - print("Number of genes", len(self.selected)) - if len(self.selected) == 0: + print("Number of genes", len(self.pp.selected)) + if len(self.pp.selected) == 0: return - df_heatmap = self.onlyFPKM( - return_as="gene name", remove_FPKM_name=True) + df_heatmap = self.onlyFPKM(return_as="gene name", remove_FPKM_name=True) df_heatmap = self._fusion_gene_id( - df_heatmap, self.type_selected, change_index=True) + df_heatmap, self.pp.what, change_index=True) if z_score is True: z_score = 0 @@ -736,21 +714,19 @@ def heatmap(self, z_score=True, col_cluster=False, method="complete", cmap="seis if len(df_heatmap) < 1000 and len(df_heatmap) > 25: big = sns.clustermap( df_heatmap, col_cluster=col_cluster, method=method, cmap=cmap, - z_score=z_score, figsize=((len(self.samples)), int(len(df_heatmap.index) / 4)), **options) + z_score=z_score, figsize=((len(self.pp.samples)), int(len(df_heatmap.index) / 4)), **options) self._export(big, name="big-heatmap", export=export) elif len(df_heatmap) > 1000: print("Too many genes for a big heatmap") @staticmethod def _z_score(df): - """ - Users should not use this function directly. - Z-score calculation - """ + """Z-score calculation""" # I could use z_score from scipy, but I don't want add scipy dependence too. # It would be: # from scipy.stats import zscore # zscore(a, axis=1, ddof=1) + print("Calculating Z Score...") df_mean = df.mean(axis="columns") df_std = df.std(axis="columns") @@ -758,13 +734,6 @@ def _z_score(df): df = df.div(df_std, axis="index") return df - def plot(self, title="", legend=True, z_score=False, export=False, df=None, size=10, ci=None, **option): - "Use self.lineplot() instead. self.plot() will not work in the future" - warnings.warn( - 'Use self.lineplot() instead. 
self.plot() will not work in the future', DeprecationWarning) - self.lineplot(title="", legend=True, z_score=False, - export=False, df=None, size=10, ci=None, **option) - def lineplot(self, title="", legend=True, z_score=False, export=False, df=None, size=10, ci=None, **option): """ LinePlot selected genes expression levels. Max number of genes 200 @@ -777,8 +746,7 @@ def lineplot(self, title="", legend=True, z_score=False, export=False, df=None, **options - all the options accepted by seaborn.factorplot""" if df is None: - self.selected_exist() - df = self.selected.copy() + df = self.pp.selected.copy() if z_score is True: df_ = self.onlyFPKM( @@ -798,14 +766,14 @@ def lineplot(self, title="", legend=True, z_score=False, export=False, df=None, print("Too many genes. Plot not shown") return - if self.type_selected == "gene": + if self.pp.what == "gene": hue = "gene_short_name" df_ = self._fusion_gene_id( - df_, self.type_selected, change_index=False) - elif self.type_selected == "isoform": + df_, self.pp.what, change_index=False) + elif self.pp.what == "isoform": hue = "gene/ID" # Change this hue name df_ = self._fusion_gene_id( - df_, self.type_selected, change_index=True) + df_, self.pp.what, change_index=True) df_ = df_.reset_index() df = pd.melt(df_, id_vars=[hue], var_name="Sample", value_name="FPKM") g = sns.factorplot(x="Sample", y="FPKM", hue=hue, @@ -813,23 +781,14 @@ def lineplot(self, title="", legend=True, z_score=False, export=False, df=None, g.fig.suptitle(title) self._export(g, export=export, name="Plot") return g - - def __import_excel(self, filename, type_selected): - """Only for testing. 
Users should not use this function directly""" - if type_selected not in ["gene", "isoform"]: - raise Exception("type_selected can be only 'gene' or 'isoform'") - self.type_selected = type_selected - filename = str(self.path + filename) - self.selected = pd.read_excel(filename, index_col=0) - try: - del self.selected["duplicate"] - except KeyError: - pass - if __name__ == "__main__": - print( - self.selected.index, - self.selected.columns, - self.selected.head(), - len(self.selected.columns) - ) - + + def _export(self, thing, export, name=None, image_extension=".png"): # add .pdf? + """Manage dataframe or image export parameter.""" + if export is False: + return + elif export is True: + _make_folder(self.pp.path) + thing.savefig(str(self.pp.path + name + image_extension)) + print("\nExported as " + name + image_extension) + else: + raise Exception("export= can be only 'False' or 'True'") \ No newline at end of file diff --git a/setup.py b/setup.py index e996954..5357f01 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup( name='papillon', - version='0.1.1', + version='0.2.0', py_modules=['papillon'], description='A Python module to read and plot (cuffdiff) Galaxy RNA-seq data', author='Domenico Somma', diff --git a/test/test_papillon.py b/test/test_papillon.py index 7696aee..51a7049 100644 --- a/test/test_papillon.py +++ b/test/test_papillon.py @@ -3,9 +3,6 @@ @author: domenico.somma@glasgow.ac.uk """ -# TO DO test exception -# TO DO test export - import os import matplotlib as mpl if os.environ.get('DISPLAY','') == '': @@ -39,8 +36,6 @@ def test_different_read(self): pp.read_folder(path) pp.read_folder(path+"/galaxy") pp.read_files([path+"/gene_exp.diff",path+"/genes.fpkm_tracking",path+"/isoform_exp.diff",path+"/isoforms.fpkm_tracking"]) -# with self.assertWarns(DeprecationWarning): -# pp.read_db(path) def test_functions_FPKM(self): self.assertEqual(pp._FPKM("ciao"),"ciao_FPKM") @@ -76,143 +71,133 @@ def test_read_folder(self): 
d=len(test.isoforms_significant.columns) self.assertTrue(a==b and b==c and c==d and d==18) print_test=pp.read_folder(path) - printable="Samples: ['Sample 1', 'Sample 2', 'Sample 3', 'Sample 4']\nComparison: ['Sample 1_vs_Sample 2', 'Sample 1_vs_Sample 3', 'Sample 1_vs_Sample 4', 'Sample 2_vs_Sample 3', 'Sample 2_vs_Sample 4', 'Sample 3_vs_Sample 4']\nGenes Detected: 5\nGenes differential expressed: 3\nIsoform Detected: 28\nIsoform differential expressed: 5\nNone of the genes is selected" + printable="Samples: ['Sample 1', 'Sample 2', 'Sample 3', 'Sample 4']\nComparison: ['Sample 1_vs_Sample 2', 'Sample 1_vs_Sample 3', 'Sample 1_vs_Sample 4', 'Sample 2_vs_Sample 3', 'Sample 2_vs_Sample 4', 'Sample 3_vs_Sample 4']\nGenes Detected: 5\nGenes differential expressed: 3\nIsoform Detected: 28\nIsoform differential expressed: 5\n" print(print_test.__str__(),"\n",printable) self.assertTrue(print_test.__str__()==printable) del print_test - def test_selected_exist(self): - test2=pp.read_folder(path) - with self.assertRaises(Exception): - test2.selected_exist() - with self.assertRaises(Exception): - test2.selected_exist(remove="Wrong") - test2.get_gene() - self.assertTrue(test2.selected_exist()) - del test2 - def test_get_gene(self): - test.get_gene() - self.assertEqual(len(test.selected),3) - self.assertEqual(len(test.selected.columns),18) - - test.get_gene("IL17RC") - self.assertEqual(test.selected.index[0],"MSTRG.10454") - self.assertEqual(len(test.selected),1) - self.assertEqual(len(test.selected.columns),18) + sub=test.get_gene() + self.assertEqual(len(sub.selected),3) + self.assertEqual(len(sub.selected.columns),18) + + sub=test.get_gene("IL17RC") + self.assertEqual(sub.selected.index[0],"MSTRG.10454") + self.assertEqual(len(sub.selected),1) + self.assertEqual(len(sub.selected.columns),18) - test.get_gene(["IL6","CCL15"]) - self.assertEqual(len(test.selected),2) - self.assertEqual(test.selected.index[0],"IL6") - self.assertEqual(test.selected.index[1],"CCL15-2") - 
self.assertEqual(len(test.selected.columns),18) + sub=test.get_gene(["IL6","CCL15"]) + self.assertEqual(len(sub.selected),2) + self.assertEqual(sub.selected.index[0],"IL6") + self.assertEqual(sub.selected.index[1],"CCL15-2") + self.assertEqual(len(sub.selected.columns),18) # Gene-Comparison Test - test.get_gene(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2") - self.assertEqual(len(test.selected),1) - self.assertEqual(test.selected.index[0],"IL6") - self.assertEqual(len(test.selected.columns),18) + sub=test.get_gene(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2") + self.assertEqual(len(sub.selected),1) + self.assertEqual(sub.selected.index[0],"IL6") + self.assertEqual(len(sub.selected.columns),18) - test.get_gene(["IL6","CCL15"], comparison="Sample 3_vs_Sample 4") - self.assertEqual(len(test.selected),1) - self.assertEqual(test.selected.index[0],"CCL15-2") - self.assertEqual(len(test.selected.columns),18) + sub=test.get_gene(["IL6","CCL15"], comparison="Sample 3_vs_Sample 4") + self.assertEqual(len(sub.selected),1) + self.assertEqual(sub.selected.index[0],"CCL15-2") + self.assertEqual(len(sub.selected.columns),18) - test.get_gene(["IL6","CCL15"], comparison="Sample 2_vs_Sample 3") - self.assertEqual(len(test.selected),0) - self.assertEqual(len(test.selected.columns),18) + sub=test.get_gene(["IL6","CCL15"], comparison="Sample 2_vs_Sample 3") + self.assertEqual(len(sub.selected),0) + self.assertEqual(len(sub.selected.columns),18) # Gene-Comparison-sign Test - test.get_gene(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", sign=">") - self.assertEqual(len(test.selected),0) - self.assertEqual(len(test.selected.columns),18) + sub=test.get_gene(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", sign=">") + self.assertEqual(len(sub.selected),0) + self.assertEqual(len(sub.selected.columns),18) - test.get_gene(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", sign="<") - self.assertEqual(len(test.selected),1) - self.assertEqual(test.selected.index[0],"IL6") - 
self.assertEqual(len(test.selected.columns),18) + sub=test.get_gene(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", sign="<") + self.assertEqual(len(sub.selected),1) + self.assertEqual(sub.selected.index[0],"IL6") + self.assertEqual(len(sub.selected.columns),18) - test.get_gene() - self.assertEqual(len(test.selected),3) - self.assertEqual(len(test.selected.columns),18) + sub=test.get_gene() + self.assertEqual(len(sub.selected),3) + self.assertEqual(len(sub.selected.columns),18) with self.assertRaises(Exception): - test.get_gene(comparison="Sample 1_vs_Sample 2", sign="Wrong") + sub=test.get_gene(comparison="Sample 1_vs_Sample 2", sign="Wrong") with self.assertRaises(Exception): - test.get_gene(sign=">") + sub=test.get_gene(sign=">") with self.assertRaises(Exception): - test.get_gene(comparison="Wrong") + sub=test.get_gene(comparison="Wrong") def test_get_isoform(self): - test.get_isoform() - self.assertEqual(len(test.selected),5) - self.assertEqual(len(test.selected.columns),18) - - test.get_isoform("IL6") - self.assertEqual(test.selected.index[0],"NM_000600.3") - self.assertEqual(len(test.selected),1) - self.assertEqual(len(test.selected.columns),18) - - test.get_isoform("CCL15") - self.assertEqual(test.selected.index[0],"NM_032965.4") - self.assertEqual(test.selected.index[1],"NM_032965.4-2") - self.assertEqual(len(test.selected),2) - self.assertEqual(len(test.selected.columns),18) + sub=test.get_isoform() + self.assertEqual(len(sub.selected),5) + self.assertEqual(len(sub.selected.columns),18) + + sub=test.get_isoform("IL6") + self.assertEqual(sub.selected.index[0],"NM_000600.3") + self.assertEqual(len(sub.selected),1) + self.assertEqual(len(sub.selected.columns),18) + + sub=test.get_isoform("CCL15") + self.assertEqual(sub.selected.index[0],"NM_032965.4") + self.assertEqual(sub.selected.index[1],"NM_032965.4-2") + self.assertEqual(len(sub.selected),2) + self.assertEqual(len(sub.selected.columns),18) - test.get_isoform(["IL6","CD44"]) - 
self.assertEqual(len(test.selected),3) - self.assertEqual(len(test.selected.columns),18) + sub=test.get_isoform(["IL6","CD44"]) + self.assertEqual(len(sub.selected),3) + self.assertEqual(len(sub.selected.columns),18) - test.get_isoform(["IL6","CCL15"]) - self.assertEqual(len(test.selected),3) - self.assertEqual(len(test.selected.columns),18) + sub=test.get_isoform(["IL6","CCL15"]) + self.assertEqual(len(sub.selected),3) + self.assertEqual(len(sub.selected.columns),18) # Isoform-Comparison Test - test.get_isoform(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2") - self.assertEqual(len(test.selected),1) - self.assertEqual(test.selected.index[0],"NM_000600.3") - self.assertEqual(len(test.selected.columns),18) + sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2") + self.assertEqual(len(sub.selected),1) + self.assertEqual(sub.selected.index[0],"NM_000600.3") + self.assertEqual(len(sub.selected.columns),18) - test.get_isoform(["IL6","CCL15"], comparison="Sample 3_vs_Sample 4") - self.assertEqual(len(test.selected),2) - self.assertEqual(test.selected.index[0],"NM_032965.4") - self.assertEqual(len(test.selected.columns),18) + sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 3_vs_Sample 4") + self.assertEqual(len(sub.selected),2) + self.assertEqual(sub.selected.index[0],"NM_032965.4") + self.assertEqual(len(sub.selected.columns),18) - test.get_isoform(["IL6","CCL15"], comparison="Sample 2_vs_Sample 4") - self.assertEqual(len(test.selected),1) - self.assertEqual(test.selected.index[0],"NM_032965.4-2") - self.assertEqual(len(test.selected.columns),18) + sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 2_vs_Sample 4") + self.assertEqual(len(sub.selected),1) + self.assertEqual(sub.selected.index[0],"NM_032965.4-2") + self.assertEqual(len(sub.selected.columns),18) - test.get_isoform(["IL6","CCL15"], comparison="Sample 2_vs_Sample 3") - self.assertEqual(len(test.selected),0) - self.assertEqual(len(test.selected.columns),18) + 
sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 2_vs_Sample 3") + self.assertEqual(len(sub.selected),0) + self.assertEqual(len(sub.selected.columns),18) # Isoform-Comparison-sign Test - test.get_isoform(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", sign=">") - self.assertEqual(len(test.selected),0) - self.assertEqual(len(test.selected.columns),18) + sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", sign=">") + self.assertEqual(len(sub.selected),0) + self.assertEqual(len(sub.selected.columns),18) - test.get_isoform(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", sign="<") - self.assertEqual(len(test.selected),1) - self.assertEqual(test.selected.index[0],"NM_000600.3") - self.assertEqual(len(test.selected.columns),18) + sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", sign="<") + self.assertEqual(len(sub.selected),1) + self.assertEqual(sub.selected.index[0],"NM_000600.3") + self.assertEqual(len(sub.selected.columns),18) # Final - test.get_isoform() - self.assertEqual(len(test.selected),5) - self.assertEqual(len(test.selected.columns),18) + sub=test.get_isoform() + self.assertEqual(len(sub.selected),5) + self.assertEqual(len(sub.selected.columns),18) def test_onlyFPKM(self): test=pp.read_folder(path) - test.get_isoform() - df=test.onlyFPKM("df") + sub=test.get_isoform() + df=sub.onlyFPKM("df") self.assertTrue(type(df)==pd.DataFrame) self.assertEqual(len(df),5) self.assertEqual(len(df.columns),4) self.assertEqual(df.index[0],"NM_000600.3") self.assertEqual(df.index[-1],"NM_032965.4-2") - df=test.onlyFPKM("gene name") + df=sub.onlyFPKM("gene name") self.assertTrue(type(df)==pd.DataFrame) self.assertEqual(len(df),5) self.assertEqual(len(df.columns),5) @@ -221,27 +206,27 @@ def test_onlyFPKM(self): self.assertEqual(df.index[-1],"NM_032965.4-2") self.assertEqual(df["gene_short_name"][-1],"CCL15") - df=test.onlyFPKM("array") + df=sub.onlyFPKM("array") self.assertTrue(type(df)==numpy.ndarray) 
self.assertEqual(len(df),5) self.assertEqual(list(df[1]),[0.0, 3.0, 0.0, 0.0]) self.assertEqual(list(df[-1]),[0.0, 0.0, 0.0, 3.0]) - #making extra_df - test.get_isoform() - extra_df=test.selected.iloc[:4,2:6].T.copy() - extra_df=pd.DataFrame(data=extra_df.values, index=test.selected.index[:4], columns=test.selected.columns[2:6]) - extra_df['gene_short_name']=test.selected['gene_short_name'][:4] + # making extra_df + sub=test.get_isoform() + extra_df=sub.selected.iloc[:4,2:6].T.copy() + extra_df=pd.DataFrame(data=extra_df.values, index=sub.selected.index[:4], columns=sub.selected.columns[2:6]) + extra_df['gene_short_name']=sub.selected['gene_short_name'][:4] - #testing extra_df - df=test.onlyFPKM("df",extra_df=extra_df) + # testing extra_df + df=sub.onlyFPKM("df",extra_df=extra_df) self.assertTrue(type(df)==pd.DataFrame) self.assertEqual(len(df),4) self.assertEqual(len(df.columns),4) self.assertEqual(df.index[0],"NM_000600.3") self.assertEqual(df.index[-1],"NM_032965.4") - df=test.onlyFPKM("gene name",extra_df=extra_df) + df=sub.onlyFPKM("gene name",extra_df=extra_df) self.assertTrue(type(df)==pd.DataFrame) self.assertEqual(len(df),4) self.assertEqual(len(df.columns),5) @@ -250,7 +235,7 @@ def test_onlyFPKM(self): self.assertEqual(df.index[-1],"NM_032965.4") self.assertEqual(df["gene_short_name"][-1],"CCL15") - df=test.onlyFPKM("array",extra_df=extra_df) + df=sub.onlyFPKM("array",extra_df=extra_df) self.assertTrue(type(df)==numpy.ndarray) self.assertEqual(len(df),4) self.assertEqual(list(df[0]),[0.0, 0.0, 4.0, 0.0]) @@ -258,36 +243,36 @@ def test_onlyFPKM(self): b=[0.016800, 0.0, 0.0, 0.0] self.assertAlmostEqual(a[0],b[0], places=0) - #testing remove_FPKM_name - df=test.onlyFPKM("df",extra_df=extra_df, remove_FPKM_name=True) + # testing remove_FPKM_name + df=sub.onlyFPKM("df",extra_df=extra_df, remove_FPKM_name=True) self.assertTrue(type(df)==pd.DataFrame) self.assertEqual(len(df),4) self.assertEqual(len(df.columns),4) - 
self.assertEqual(list(df.columns),test.samples) + self.assertEqual(list(df.columns),sub.samples) self.assertEqual(df.index[0],"NM_000600.3") self.assertEqual(df.index[-1],"NM_032965.4") - df=test.onlyFPKM("gene name",extra_df=extra_df, remove_FPKM_name=True) + df=sub.onlyFPKM("gene name",extra_df=extra_df, remove_FPKM_name=True) self.assertTrue(type(df)==pd.DataFrame) self.assertEqual(len(df),4) self.assertEqual(len(df.columns),5) - self.assertEqual(list(df.columns[1:]),test.samples) + self.assertEqual(list(df.columns[1:]),sub.samples) self.assertEqual(df.index[0],"NM_000600.3") self.assertEqual(df["gene_short_name"][0],"IL6") self.assertEqual(df.index[-1],"NM_032965.4") self.assertEqual(df["gene_short_name"][-1],"CCL15") # Final - test.get_isoform() - self.assertEqual(len(test.selected),5) - self.assertEqual(len(test.selected.columns),18) + sub=test.get_isoform() + self.assertEqual(len(sub.selected),5) + self.assertEqual(len(sub.selected.columns),18) def test_z_score(self): - test.get_isoform() - df=test.onlyFPKM("df") + sub=test.get_isoform() + df=sub.onlyFPKM("df") df1=zscore(df, axis=1, ddof=1) - df2=test._z_score(df) - df2=test.onlyFPKM("array",extra_df=df2) + df2=sub._z_score(df) + df2=sub.onlyFPKM("array",extra_df=df2) self.assertAlmostEqual(df1.all(), df2.all(), places=0) def test_search(self): @@ -335,20 +320,20 @@ def test_search(self): self.assertEqual(test.selected.index[0],"NM_000610.3") self.assertEqual(test.selected.index[-1],"NM_001001389.1") - test.get_isoform() - self.assertEqual(len(test.selected),5) - self.assertEqual(len(test.selected.columns),18) + sub=test.get_isoform() + self.assertEqual(len(sub.selected),5) + self.assertEqual(len(sub.selected.columns),18) - test.get_gene() - self.assertEqual(len(test.selected),3) - self.assertEqual(len(test.selected.columns),18) + sub=test.get_gene() + self.assertEqual(len(sub.selected),3) + self.assertEqual(len(sub.selected.columns),18) def test_fusion_gene_id(self): - test.get_isoform() - 
m=len(test.selected.columns) - df=test.selected.copy() + sub=test.get_isoform() + m=len(sub.selected.columns) + df=sub.selected.copy() - df2=test._fusion_gene_id(df, test.type_selected, change_index=False) + df2=sub._fusion_gene_id(df, sub.what, change_index=False) self.assertEqual(len(df2["gene/ID"]),5) self.assertEqual(len(df2.columns),19) self.assertEqual(df2["gene/ID"][0],"IL6 NM_000600.3") @@ -358,27 +343,27 @@ def test_fusion_gene_id(self): self.assertEqual(df2["gene_short_name"][0],"IL6") self.assertEqual(df2["gene_short_name"][-1],"CCL15") - df2=test._fusion_gene_id(df, test.type_selected, change_index=True) + df2=sub._fusion_gene_id(df, sub.what, change_index=True) self.assertEqual(len(df2),5) self.assertEqual(len(df2.columns),17) self.assertEqual(df2.index[0],"IL6 NM_000600.3") self.assertEqual(df2.index[-1],"CCL15 NM_032965.4-2") - test.get_gene() - df=test.selected.copy() - df2=test._fusion_gene_id(df, test.type_selected, change_index=False) + sub=test.get_gene() + df=sub.selected.copy() + df2=sub._fusion_gene_id(df, sub.what, change_index=False) self.assertEqual(len(df2),3) self.assertEqual(len(df2.columns),18) self.assertEqual(df2.index[1],"MSTRG.10454") self.assertEqual(df2.index[-1],"CCL15-2") - df2=test._fusion_gene_id(df, test.type_selected, change_index=True) + df2=sub._fusion_gene_id(df, sub.what, change_index=True) self.assertEqual(len(df2),3) self.assertEqual(len(df2.columns),17) self.assertEqual(df2.index[0],"IL6") self.assertEqual(df2.index[-1],"CCL15") - n=len(test.selected.columns) + n=len(sub.selected.columns) self.assertTrue(m==n) def test_comparison(self): @@ -425,6 +410,7 @@ def multidrop(comp): del test2 def test_change_samples_order(self): + test=pp.read_folder(path) test.change_order(["Sample 4","Sample 3","Sample 2","Sample 1"]) samples_test=["Sample 4","Sample 3","Sample 2","Sample 1"] self.assertTrue(test.samples==samples_test) @@ -444,25 +430,32 @@ def test_change_samples_order(self): test.change_order(["Sample 4","Sample 
3","Sample 2"]) with self.assertRaises(Exception): test.change_order(["Sample 4","Sample 3","Sample 2","Wrong"]) + + test=pp.read_folder(path) def test_plots(self): def plot_maker(type_sel,z_score): - + test=pp.read_folder(path) + if type_sel == "gene": + sub=test.get_gene() + elif type_sel == "isoform": + sub=test.get_isoform() + if z_score == True: - df_ = test.onlyFPKM(return_as="df",remove_FPKM_name=True) - df_norm = test._z_score(df_) - df_norm["gene_short_name"] = test.selected["gene_short_name"] + df_ = sub.onlyFPKM(return_as="df",remove_FPKM_name=True) + df_norm = sub._z_score(df_) + df_norm["gene_short_name"] = sub.selected["gene_short_name"] df_ = df_norm.copy() elif z_score==False: - df_ = test.onlyFPKM(return_as="gene name",remove_FPKM_name=True) + df_ = sub.onlyFPKM(return_as="gene name",remove_FPKM_name=True) if type_sel == "gene": hue = "gene_short_name" - df_ = test._fusion_gene_id(df_, type_sel, change_index=False) + df_ = sub._fusion_gene_id(df_, type_sel, change_index=False) elif type_sel == "isoform": hue = "gene/ID" - df_ = test._fusion_gene_id(df_, type_sel, change_index=True) + df_ = sub._fusion_gene_id(df_, type_sel, change_index=True) df_ = df_.reset_index() # df_ = test._fusion_gene_id(df_, type_sel, change_index=False) @@ -480,34 +473,36 @@ def image_check(): print(hash1,hash2) self.assertEqual(hash1,hash2) - test.get_gene() - -# with self.assertWarns(DeprecationWarning): -# test.plot() + sub=test.get_gene() plot_maker("gene",False) - test.lineplot(export=True) + sub.lineplot(export=True) image_check() plot_maker("gene",True) - test.lineplot(export=True,z_score=True) + sub.lineplot(export=True,z_score=True) image_check() - test.get_isoform() + sub=test.get_isoform() plot_maker("isoform",False) - test.lineplot(export=True) + sub.lineplot(export=True) image_check() plot_maker("isoform",True) - test.lineplot(export=True,z_score=True) + sub.lineplot(export=True,z_score=True) image_check() def test_heatmap(self): def heatmap_maker(z_score, 
type_sel): - df_heatmap = test.onlyFPKM(return_as="gene name",remove_FPKM_name=True) - df_heatmap = test._fusion_gene_id(df_heatmap, type_sel, change_index=True) + test=pp.read_folder(path) + if type_sel == "gene": + sub=test.get_gene() + elif type_sel == "isoform": + sub=test.get_isoform() + df_heatmap = sub.onlyFPKM(return_as="gene name",remove_FPKM_name=True) + df_heatmap = sub._fusion_gene_id(df_heatmap, type_sel, change_index=True) im1 = sns.clustermap(df_heatmap, col_cluster=False, method="complete", cmap="seismic", z_score=z_score) im1.savefig(str(test.path + "test.png")) @@ -519,24 +514,24 @@ def image_check(): print(hash1,hash2) self.assertEqual(hash1,hash2) - test.get_gene() + sub=test.get_gene() heatmap_maker(0,"gene") - test.heatmap(export=True) + sub.heatmap(export=True) image_check() heatmap_maker(None,"gene") - test.heatmap(z_score=False,export=True) + sub.heatmap(z_score=False,export=True) image_check() - test.get_isoform() + sub=test.get_isoform() heatmap_maker(0,"isoform") - test.heatmap(export=True) + sub.heatmap(export=True) image_check() heatmap_maker(None,"isoform") - test.heatmap(z_score=False,export=True) + sub.heatmap(z_score=False,export=True) image_check() if __name__ == '__main__': From 325bb3088b7fd875d8ad01ef152e4e9b8cc0a7b9 Mon Sep 17 00:00:00 2001 From: domenico-somma <34346930+domenico-somma@users.noreply.github.com> Date: Fri, 30 Mar 2018 11:24:52 +0100 Subject: [PATCH 2/8] Dev: improved classes More classes. Gene list as class on its own. 
Now it is possible to sum two lists together --- papillon.py | 483 ++++++++++++++++++++++++++---------------- test/test_papillon.py | 218 +++++++++---------- 2 files changed, 414 insertions(+), 287 deletions(-) diff --git a/papillon.py b/papillon.py index c8042b2..9e019c2 100644 --- a/papillon.py +++ b/papillon.py @@ -147,7 +147,7 @@ def _papillon_builder(isoform_fpkm, isoform_diff, gene_fpkm, gene_diff, path, dr col_sample1 = [] col_sample2 = [] for sample in samples: - if sample in list(isoform_diff["sample_1"]): + if sample in list(isoform_diff["sample_1"]): # TO DO - To change with tolist? col_sample1.append(sample) if sample in list(isoform_diff["sample_2"]): col_sample2.append(sample) @@ -162,14 +162,14 @@ def _papillon_builder(isoform_fpkm, isoform_diff, gene_fpkm, gene_diff, path, dr for sample1 in col_sample1: for sample2 in col_sample2: if sample1 != sample2: - if len(isoform_diff[(isoform_diff["sample_1"] == sample1) & (isoform_diff["sample_2"] == sample2)]) != 0: + if len(isoform_diff[(isoform_diff["sample_1"] == sample1) & (isoform_diff["sample_2"] == sample2)]) != 0: # TO DO - Not so clear, can be improved? comparison = _vs(sample1, sample2) if comparison in drop_comparison: n_left -= 1 elif comparison not in drop_comparison: comparisons.append(comparison) print(comparison) - else: + else: # TO DO - Really necessary?
pass else: pass @@ -182,14 +182,8 @@ def _papillon_builder(isoform_fpkm, isoform_diff, gene_fpkm, gene_diff, path, dr return Papillon_db(path, samples, comparisons, genes_detect, genes_significant, isoforms_detect, isoforms_significant) - # Not working now - to fix to easy add features -# if __name__ == "__main__": -# class CummerbundTables: -# def __init__(self): -# self.isoform_fpkm -# self.isoform_diff -# self.gene_fpkm -# self.gene_diff +#def _read_folder_testing(): + #delegare e return papillon_db and cummerbund def _generate_df(what, samples, gene_fpkm, gene_diff, isoform_fpkm, isoform_diff, comparisons): @@ -226,8 +220,7 @@ def gene_or_isoform(what, gene_fpkm, gene_diff, isoform_fpkm, isoform_diff): df_detected = df[TrueFalseMask.any(axis=1)] print("\n\tDetected ", what + "s: ", len(df_detected)) - df_significant = Papillon_db._significant( - df_detected, comparisons, what) + df_significant = _Manipulate_db._significant(df_detected, comparisons, what) return df_detected, df_significant @@ -274,12 +267,20 @@ def _obtain_list(genelist, path): # TO DO - eventually remove empty one return gene_list +#class _Cummerbund: +# def __init__(self, isoform_fpkm, isoform_diff, gene_fpkm, gene_diff): +# self.isoform_fpkm +# self.isoform_diff +# self.gene_fpkm +# self.gene_diff + + class Papillon_db: """Make a Papillon_db object and permit to change some values self.path - files path self.samples - samples found - self.comparison - comparisons found + self.comparisons - comparisons found self.genes_detect - dataframe of genes detected self.genes_significant - dataframe of genes significant self.isoforms_detect - dataframe of isoforms detected @@ -287,23 +288,23 @@ class Papillon_db: expressed redefine __str__""" - # TO DO - Add the function to export the Papillon_db (as table? as sqlite?) 
def __init__(self, path, samples, comparisons, genes_detected, genes_significant, isoforms_detected, isoform_significant): self.path = path self.samples = samples - self.comparison = comparisons + self.comparisons = comparisons self.genes_detect = genes_detected self.genes_significant = genes_significant self.isoforms_detect = isoforms_detected self.isoforms_significant = isoform_significant - self._compare() - print("\n...Done") + self.Manipulate = _Manipulate_db() + self.Manipulate._compare(self) + print("\n...Done") def __str__(self): a = "Samples: " + str(self.samples) + "\n" - b = "Comparison: " + str(self.comparison) + "\n" + b = "Comparison: " + str(self.comparisons) + "\n" c = "Genes Detected: " + str(len(self.genes_detect)) + "\n" d = "Genes differential expressed: " + \ str(len(self.genes_significant)) + "\n" @@ -312,21 +313,72 @@ def __str__(self): str(len(self.isoforms_significant)) + "\n" visual = a + b + c + d + e + f return visual + + def drop_comparison(self, comparison): + """Drop Comparison (str) or list of comparisons and re-calculate + df_significant - @staticmethod - def _significant(df_detected, comparison, what): - """Calculate significant expressed genes.""" - if what not in ["gene", "isoform"]: - raise Exception("what= not known") - df_significant = df_detected[ - df_detected.loc[:, comparison].any(axis=1)] - print("\n\tSignificant expressed ", what + "s: ", len(df_significant)) - return df_significant + comparison: comparison (str) or list of comparisons""" + self = self.Manipulate.dropComparison(self, comparison) + + def change_order(self, new_order): + """Change the samples order + + new_order: list of samples order""" + self = self.Manipulate.change_order(self, new_order) + + def get_gene(self, genelist=None, comparison=None, comparison_sign=None, fold_ind=None, fol_sign=None): + """This function select genes. 
It return a Papillon_list object + + genelist - accept string (1 gene name), list of gene names or file + with a list of gene names + comparison - accept only 1 comparison as str (already present in + the data) + sign - usable in combination with comparison, accept either ">" or + "<" + fold_ind - fold induction (log2) higher then number + """ + return self.Manipulate.get_gene(self, genelist, comparison, comparison_sign, fold_ind, fol_sign) + + def get_isoform(self, genelist=None, comparison=None, comparison_sign=None, fold_ind=None, fol_sign=None): + """This function select isoforms. It creates a Papillon object + + genelist - accept string (gene name), list of gene names or file + with a list of gene names + comparison - accept only 1 comparison as str (already present in + the data) + sign - usable in combination with comparison, accept either ">" or + "<" + fold_ind - fold induction (log2) higher then number""" + + return self.Manipulate.get_isoform(self, genelist, comparison, comparison_sign, fold_ind, fol_sign) - def _compare(self): + def search(self, word, where, how="table", export=False): + """search among genes/isoforms names in detected and significant + + word - accept a str to search among the gene names + where - accept: + "genes_detected" + "genes_significant" + "isoforms_detected" + "isoforms_significant" + + how - accept: + "table" return the dataframe with the genes found + "list" return a list of names, no duplicates + "selected" put the genes found among the differential expressed + genes in self.selected (to plot), + working only with where="significant" """ + return self.Manipulate.search(self, word, where, how, export) + +class _Manipulate_db: + # TO DO - Add the function to export the Papillon_db (as table? as sqlite?) 
+ + @staticmethod + def _compare(pp): """Compare genes and isoforms significantly expressed""" - genes = list(self.genes_significant["gene_short_name"]) - isoforms = list(self.isoforms_significant["gene_short_name"]) + genes = list(pp.genes_significant["gene_short_name"]) + isoforms = list(pp.isoforms_significant["gene_short_name"]) genes_not_found = [] isoforms_not_found = [] for name in genes: @@ -353,7 +405,7 @@ def _compare(self): print(set(isoforms_not_found)) return genes_not_found, isoforms_not_found, n # Only for tests so far. - def dropComparison(self, comparison): + def dropComparison(self, pp, comparison): """Drop Comparison (str) or list of comparisons and re-calculate df_significant @@ -361,12 +413,12 @@ def dropComparison(self, comparison): """ def dropComp(comp): - if comp in self.comparison: - del self.isoforms_detect[comp] - del self.isoforms_detect[str("q-value_" + comp)] - del self.genes_detect[comp] - del self.genes_detect[str("q-value_" + comp)] - self.comparison.remove(comp) + if comp in pp.comparisons: + del pp.isoforms_detect[comp] + del pp.isoforms_detect[str("q-value_" + comp)] + del pp.genes_detect[comp] + del pp.genes_detect[str("q-value_" + comp)] + pp.comparisons.remove(comp) print(comp, " removed") else: raise Exception(comp, " not found, please double check it") @@ -377,55 +429,68 @@ def dropComp(comp): for comp in comparison: dropComp(comp) - self.genes_significant = self._significant( - self.genes_detect, self.comparison, "gene") - self.isoforms_significant = self._significant( - self.isoforms_detect, self.comparison, "isoform") - self._compare() + pp.genes_significant = self._significant( + pp.genes_detect, pp.comparisons, "gene") + pp.isoforms_significant = self._significant( + pp.isoforms_detect, pp.comparisons, "isoform") + self._compare(pp) print("...Done") + return pp - def change_order(self, new_order): + @staticmethod + def _significant(df_detected, comparison, what): + """Calculate significant expressed genes.""" + if what 
not in ["gene", "isoform"]: + raise Exception("what= not known") + df_significant = df_detected[ + df_detected.loc[:, comparison].any(axis=1)] + print("\n\tSignificant expressed ", what + "s: ", len(df_significant)) + return df_significant + + @staticmethod + def change_order(pp, new_order): """Change the samples order new_order: list of samples order""" - n_sampl = len(self.samples) + n_sampl = len(pp.samples) if len(new_order) != n_sampl: raise Exception("Number of samples doesn't match") for sample in new_order: - if sample not in self.samples: + if sample not in pp.samples: raise Exception(sample, "Sample not known") - cols = self.genes_detect.columns.tolist() + cols = pp.genes_detect.columns.tolist() cols = cols[:2] + _FPKM(new_order) + cols[n_sampl + 2:] - self.samples = new_order - self.genes_detected = self.genes_detect[cols] - self.genes_significant = self.genes_significant[cols] - self.isoforms_detect = self.isoforms_detect[cols] - self.isoforms_significant = self.isoforms_significant[cols] - - def _select(self, genelist, what, comparison, sign): + pp.samples = new_order + pp.genes_detected = pp.genes_detect[cols] + pp.genes_significant = pp.genes_significant[cols] + pp.isoforms_detect = pp.isoforms_detect[cols] + pp.isoforms_significant = pp.isoforms_significant[cols] + return pp + + @staticmethod + def _select(pp, genelist, what):#, comparison, sign): """Part of get_gene/get_isoform function""" if what != "gene" and what != "isoform": raise Exception("Only what=gene or what=isoform admitted") - gene_list = _obtain_list(genelist, path=self.path) + gene_list = _obtain_list(genelist, path=pp.path) if what == "gene": - df = pd.DataFrame.copy(self.genes_significant) + df = pd.DataFrame.copy(pp.genes_significant) elif what == "isoform": - df = pd.DataFrame.copy(self.isoforms_significant) - + df = pd.DataFrame.copy(pp.isoforms_significant) n=0 if gene_list != []: df["Selected"] = [True if name in gene_list else False for name in df["gene_short_name"]] df = 
df[df["Selected"] == True].iloc[:, :-1] for name in gene_list: if name not in list(df["gene_short_name"]): - print("Gene name not found:\t", name) + print("Gene name not found:\t", name) # return not found list n+=1 print("Number of gene not found: ",n) return df - def search(self, word, where, how="table", export=False): + def search(self, pp ,word, where, how="table", export=False): """search among genes/isoforms names in detected and significant word - accept a str to search among the gene names @@ -437,10 +502,10 @@ def search(self, word, where, how="table", export=False): how - accept: "table" return the dataframe with the genes found - "list" return a list of names, no duplicates - "selected" put the genes found among the differential expressed - genes in self.selected (to plot), - working only with where="significant" """ + "list" return a list of names, no duplicates""" +# "selected" put the genes found among the differential expressed +# genes in self.selected (to plot), +# working only with where="significant" """ def df_or_list(df_, how_): if how_ == "table": @@ -459,14 +524,14 @@ def df_or_list(df_, how_): pass word1, word2 = where.split("_") if word1 == "genes": - df = self.genes_detect[self.genes_detect["gene_short_name"].str.contains( + df = pp.genes_detect[pp.genes_detect["gene_short_name"].str.contains( word)] - df_sig = self.genes_significant[self.genes_significant["gene_short_name"].str.contains( + df_sig = pp.genes_significant[pp.genes_significant["gene_short_name"].str.contains( word)] elif word1 == "isoforms": - df = self.isoforms_detect[self.isoforms_detect["gene_short_name"].str.contains( + df = pp.isoforms_detect[pp.isoforms_detect["gene_short_name"].str.contains( word)] - df_sig = self.isoforms_significant[self.isoforms_significant["gene_short_name"].str.contains( + df_sig = pp.isoforms_significant[pp.isoforms_significant["gene_short_name"].str.contains( word)] if len(df) == 0: @@ -495,9 +560,10 @@ def df_or_list(df_, how_): found = 
df_or_list(df_sig, how) self._export(found, export=export, name="search_result") + print(found) return found - def get_gene(self, genelist=None, comparison=None, sign=None): + def get_gene(self, pp, genelist=None, comparison=None, comparison_sign=None, fold_ind=None, fold_sign=None): """This function select genes. It creates a Papillon object genelist - accept string (1 gene name), list of gene names or file @@ -507,11 +573,11 @@ def get_gene(self, genelist=None, comparison=None, sign=None): sign - usable in combination with comparison, accept either ">" or "<" """ - df = self._select(genelist, "gene", comparison, sign) - return Papillon(df, genelist, "gene", comparison, sign, self.path, self.samples) + df = self._select(pp, genelist, "gene")#, comparison, comparison_sign) # Why I have done this pre-selection? + return Papillon_list(df, "gene", pp.comparisons, pp.path, pp.samples ,comparison, comparison_sign, fold_sign, fold_ind) # To do - Return number genes not found - def get_isoform(self, genelist=None, comparison=None, sign=None): + def get_isoform(self, pp, genelist=None, comparison=None, comparison_sign=None, fold_ind=None, fold_sign=None): """This function select isoforms. It creates a Papillon object genelist - accept string (gene name), list of gene names or file @@ -522,8 +588,8 @@ def get_isoform(self, genelist=None, comparison=None, sign=None): "<" export - True/False whether want or not export the dataframe of selected genes""" - df = self._select(genelist, "isoform", comparison, sign) - return Papillon(df, genelist, "isoform", comparison, sign, self.path, self.samples) + df = self._select(pp, genelist, "isoform")#, comparison, comparison_sign) # Why I have done this pre-selection? 
+ return Papillon_list(df, "isoform", pp.comparisons, pp.path, pp.samples, comparison, comparison_sign, fold_sign, fold_ind) def _export(self, thing, export, name=None): """Manage dataframe or image export parameter.""" @@ -536,53 +602,155 @@ def _export(self, thing, export, name=None): else: raise Exception("export= can be only 'False' or 'True'") - -class Papillon: - def __init__(self, df, genelist, what, comparison, sign, path, samples): + +class Papillon_list: + def __init__(self, df, what, comparisons, path, samples, comparison=None, comparison_sign=None, fold_ind=None, fold_sign=">"): self.df=df - self.selected=df.copy() - self.genelist=genelist self.what=what - self.comparison=comparison - self.sign=sign self.path=path self.samples=samples - self._sub_select(comparison, sign) +# self.genelist=genelist + if comparison is None: + if comparison_sign is not None: + raise Exception("Sign passed but not comparison") + else: + self.comparison=comparisons + elif comparison is not None: + if comparison not in comparisons: + raise Exception("Comparison not found") + else: + self.comparison=comparison + self.comparison_sign=comparison_sign + self.fold_ind=fold_ind + self.fold_sign=fold_sign + + self.plot=_Plot() + self.Manipulate = _Manipulate_list() + self.Manipulate._sub_select(self, comparison, comparison_sign, fold_ind, fold_sign) + + def __str__(self): + a = "Number of "+ self.what + " selected: "+ str(len(self.df)) + "\n" + b = "Samples: " + str(self.samples) + "\n" + visual = a + b + if self.comparison_sign is not None: + w1,w2=_vs(self.comparison) + n = "Comparison selected: " + w1 + self.comparison_sign + w2 + "\n" + else: + n = "Comparison selected: " + str(self.comparison) + "\n" + visual = visual + n + if self.fold_ind is not None: + n = "Fold induction log2" + self.comparison_sign + str(self.fold_ind) + "\n" + visual = visual + n + self.show() + return visual + + def __add__(self, other): + if self.what != other.what: + raise Exception("Impossible, one is 
gene, the other isoform") + elif self.samples != other.samples or self.comparison != other.comparison or self.path != other.path: + raise Exception("The two elements seems to have different origins") + df= pd.merge(self.df, other.df, how='outer') + return Papillon_list(df, what=self.what, comparisons=self.comparison, comparison=None, path=self.path, samples=self.samples) + + def __radd__(self, other): + if other == 0: + return self + else: + return self.__add__(other) # sum([T1, T2, T3]) + + def show(self): + self.Manipulate.show(self) + +# def __getattr__(self, arg): # TO DO - Add signle functions with descriptions +# _plot=_Plot() +# setattr(_plot, "pp", self) +# return getattr(_plot, arg) + + def onlyFPKM(self, return_as, remove_FPKM_name=False): + """Take a Papillon dataframe and a list of samples, return only FPKM columns. - def _sub_select(self, comparison, sign): + return as: + "df" - pandas DataFrame + "array" - numpy array + "gene name" - pandas DataFrame containing gene names""" + return self.plot.onlyFPKM(self.df, self.samples, return_as, remove_FPKM_name) + + def plot(**parameter): + """Deprecated. Use self.lineplot()""" + warnings.warn('self.plot() is deprecated. Use self.lineplot() instead.', DeprecationWarning) + + def heatmap(self, z_score=False, col_cluster=False, method="complete", cmap="seismic", export=False, **options): + """Generate heatmap using selected genes/isoforms + z_score - True/False whether want or not apply z-score normalization + col_cluster - True/False whether want or not cluster the samples + method - clustering algorithm - default is complete-linkage + cmap - map color + export - True/False whether want or not export the dataframe of + selected genes + **options - all the options accepted by seaborn.clustermap + default metric is euclidean. 
+ """ + self.plot.heatmap(self, z_score, col_cluster, method, cmap, export, **options) + + def lineplot(self, title="", legend=True, z_score=False, export=False, df=None, size=10, ci=None, **option): + """ + LinePlot selected genes expression levels. Max number of genes 200 + + title - accept a str as title of the plot + legend - True/False show the legend + z_score - True/False calculate the z-score normalization + export - True/False whether or not export the image + df - accept an exernal dataframe, different from self.selected + **options - all the options accepted by seaborn.factorplot""" + self.plot.lineplot(self, title, legend, z_score, export, df, size, ci, **option) + + +class _Manipulate_list: + def _sub_select(self, pp, comparison, comparison_sign, fold_ind, fold_sign): """ Part of get_gene/get_isoform function""" ACCEPTED_SIGN = [">", "<", None] - if comparison is None: - if sign is None: - return - elif sign is not None: - raise Exception("Sign passed, but not comparison") - elif comparison is not None: - if sign not in ACCEPTED_SIGN: + if comparison_sign is not None: + if comparison_sign not in ACCEPTED_SIGN: raise Exception('Only ">" "<" usable.') - if comparison not in self.comparison: - raise Exception("Comparison not found") - self.selected = self.selected[self.selected[comparison] == True] - sample1, sample2 = _vs(comparison) - if sign == ">": - self.selected = self.selected[self.selected[ - _FPKM(sample1)] > self.selected[_FPKM(sample2)]] - elif sign == "<": - self.selected = self.selected[self.selected[_FPKM( - sample1)] < self.selected[_FPKM(sample2)]] - print("\nNumber of ", self.what," selected: ", len(self.selected)) + if comparison is None: + raise Exception("Comparison_sign passed, but not comparison") - def __str__(self): - a = "Samples: " + str(self.samples) + "\n" - b = "Comparison: " + str(self.comparison) + "\n" - c = "Number of "+ self.what + " selected: "+ str(len(self.df)) + "\n" - visual = a + b + c - return visual + selected = 
pp.df + if comparison is not None: + selected = selected[selected[comparison] == True] + sample1, sample2 = _vs(comparison) + if comparison_sign == ">": + selected = selected[selected[_FPKM(sample1)] > selected[_FPKM(sample2)]] + elif comparison_sign == "<": + selected = selected[selected[_FPKM(sample1)] < selected[_FPKM(sample2)]] + if fold_ind is not None: + fi=[str("fi_log2_")+comp for comp in self.comparison] + if fold_sign is ">": + TrueFalseMask1=selected[fi]>fold_ind# or [df[fi]<-fold_ind] + TrueFalseMask2=selected[fi]<-fold_ind + if fold_sign is "<": + TrueFalseMask1=selected[fi]-fold_ind + + TrueFalseMask1=TrueFalseMask1.any(axis=1) + TrueFalseMask2=TrueFalseMask2.any(axis=1) +# TrueFalseMask=pd.merge(TrueFalseMask1,TrueFalseMask2,how="left") + TrueFalseMask=selected.copy() + TrueFalseMask["A"]=TrueFalseMask1 + TrueFalseMask["B"]=TrueFalseMask2 + TrueFalseMask=TrueFalseMask.loc[:,["A","B"]] + TrueFalseMask=TrueFalseMask.any(axis=1) + selected=selected[TrueFalseMask] + pp.df=selected.copy() + + print("\nNumber of ", pp.what," selected: ", len(pp.df)) - - def _export(self, thing, export, name=None): + def show(self, pp): + print(pp.df) + + def _export(self, thing, export, name=None): # TO DO - Not working. To activate """Manage dataframe or image export parameter.""" if export is False: return @@ -592,42 +760,10 @@ def _export(self, thing, export, name=None): print("\nExported as " + name + ".xls\n") else: raise Exception("export= can be only 'False' or 'True'") - - def __getattr__(self, arg): - _plot=_Plot() - setattr(_plot, "pp", self) - return getattr(_plot, arg) - def swap_gene_isoform(): + def swap_gene_isoform(self): pass #TO DO - - def __add__(): - pass - #TO DO - - def __radd__(): - pass - #Maybe TO DO - - def __import_excel(self, filename, type_selected): - """Only for testing. 
Users should not use this function directly""" - if type_selected not in ["gene", "isoform"]: - raise Exception("type_selected can be only 'gene' or 'isoform'") - self.type_selected = type_selected - filename = str(self.path + filename) - self.selected = pd.read_excel(filename, index_col=0) - try: - del self.selected["duplicate"] - except KeyError: - pass - if __name__ == "__main__": - print( - self.selected.index, - self.selected.columns, - self.selected.head(), - len(self.selected.columns) - ) class _Plot: """ """ @@ -648,17 +784,13 @@ def _fusion_gene_id(df, what, change_index=False): del df['gene_short_name'] return df - def onlyFPKM(self, return_as, **option): + def onlyFPKM(self, df, samples, return_as, remove_FPKM_name=False): """Take a Papillon dataframe and a list of samples, return only FPKM columns. return as: "df" - pandas DataFrame "array" - numpy array "gene name" - pandas DataFrame containing gene names""" - df=self.pp.selected - if isinstance(option.get("extra_df"), pd.DataFrame): - df = option.get("extra_df") - samples=self.pp.samples if return_as == "df": df = df.loc[:, _FPKM(samples)] elif return_as == "array": @@ -670,20 +802,16 @@ def onlyFPKM(self, return_as, **option): raise Exception( "Return_as not known. Only 'df','array','gene name'") - if option.get("remove_FPKM_name") is True: + if remove_FPKM_name is True: mydic = {} - n = len(self.pp.samples) + n = len(samples) while n != 0: n -= 1 - mydic[_FPKM(self.pp.samples[n])] = self.pp.samples[n] + mydic[_FPKM(samples[n])] = samples[n] df.rename(columns=mydic, inplace=True) return df - - def plot(**parameter): - """Deprecated. Use self.lineplot()""" - warnings.warn('self.plot() is deprecated. 
Use self.lineplot() instead.', DeprecationWarning) - def heatmap(self, z_score=True, col_cluster=False, method="complete", cmap="seismic", export=False, **options): + def heatmap(self, pp, z_score=True, col_cluster=False, method="complete", cmap="seismic", export=False, **options): """Generate heatmap using selected genes/isoforms z_score - True/False whether want or not apply z-score normalization col_cluster - True/False whether want or not cluster the samples @@ -694,15 +822,15 @@ def heatmap(self, z_score=True, col_cluster=False, method="complete", cmap="seis **options - all the options accepted by seaborn.clustermap default metric is euclidean. """ - if len(self.pp.samples) > 10: + if len(pp.samples) > 10: raise Exception("High-dimensional data. Ronan et al., 2016") - print("Number of genes", len(self.pp.selected)) - if len(self.pp.selected) == 0: + print("Number of genes", len(pp.df)) + if len(pp.df) == 0: return - df_heatmap = self.onlyFPKM(return_as="gene name", remove_FPKM_name=True) + df_heatmap = self.onlyFPKM(pp.df, pp.samples, return_as="gene name", remove_FPKM_name=True) df_heatmap = self._fusion_gene_id( - df_heatmap, self.pp.what, change_index=True) + df_heatmap, pp.what, change_index=True) if z_score is True: z_score = 0 @@ -710,12 +838,12 @@ def heatmap(self, z_score=True, col_cluster=False, method="complete", cmap="seis z_score = None small = sns.clustermap( df_heatmap, col_cluster=col_cluster, method=method, cmap=cmap, z_score=z_score, **options) - self._export(small, name="small-heatmap", export=export) + self._export(small, pp.path, name="small-heatmap", export=export) if len(df_heatmap) < 1000 and len(df_heatmap) > 25: big = sns.clustermap( df_heatmap, col_cluster=col_cluster, method=method, cmap=cmap, - z_score=z_score, figsize=((len(self.pp.samples)), int(len(df_heatmap.index) / 4)), **options) - self._export(big, name="big-heatmap", export=export) + z_score=z_score, figsize=((len(pp.samples)), int(len(df_heatmap.index) / 4)), **options) + 
self._export(big, pp.path, name="big-heatmap", export=export) elif len(df_heatmap) > 1000: print("Too many genes for a big heatmap") @@ -734,7 +862,7 @@ def _z_score(df): df = df.div(df_std, axis="index") return df - def lineplot(self, title="", legend=True, z_score=False, export=False, df=None, size=10, ci=None, **option): + def lineplot(self, pp, title="", legend=True, z_score=False, export=False, df=None, size=10, ci=None, **option): """ LinePlot selected genes expression levels. Max number of genes 200 @@ -746,17 +874,16 @@ def lineplot(self, title="", legend=True, z_score=False, export=False, df=None, **options - all the options accepted by seaborn.factorplot""" if df is None: - df = self.pp.selected.copy() + df = pp.df.copy() + samples=pp.samples if z_score is True: - df_ = self.onlyFPKM( - extra_df=df, return_as="df", remove_FPKM_name=True) + df_ = self.onlyFPKM(df, samples, return_as="df", remove_FPKM_name=True) df_norm = self._z_score(df_) df_norm["gene_short_name"] = df["gene_short_name"] df_ = df_norm.copy() elif z_score is False: - df_ = self.onlyFPKM( - extra_df=df, return_as="gene name", remove_FPKM_name=True) + df_ = self.onlyFPKM(df, samples, return_as="gene name", remove_FPKM_name=True) print("Number of genes to plot: ", len(df_)) if len(df_) > 50 and len(df_) < 200: @@ -766,29 +893,29 @@ def lineplot(self, title="", legend=True, z_score=False, export=False, df=None, print("Too many genes. 
Plot not shown") return - if self.pp.what == "gene": + if pp.what == "gene": hue = "gene_short_name" df_ = self._fusion_gene_id( - df_, self.pp.what, change_index=False) - elif self.pp.what == "isoform": + df_, pp.what, change_index=False) + elif pp.what == "isoform": hue = "gene/ID" # Change this hue name df_ = self._fusion_gene_id( - df_, self.pp.what, change_index=True) + df_, pp.what, change_index=True) df_ = df_.reset_index() df = pd.melt(df_, id_vars=[hue], var_name="Sample", value_name="FPKM") g = sns.factorplot(x="Sample", y="FPKM", hue=hue, data=df, ci=ci, size=size, legend=legend, **option) g.fig.suptitle(title) - self._export(g, export=export, name="Plot") + self._export(g, pp.path, export=export, name="Plot") return g - def _export(self, thing, export, name=None, image_extension=".png"): # add .pdf? + def _export(self, thing, path, export, name=None, image_extension=".png"): # add .pdf? """Manage dataframe or image export parameter.""" if export is False: return elif export is True: - _make_folder(self.pp.path) - thing.savefig(str(self.pp.path + name + image_extension)) + _make_folder(path) + thing.savefig(str(path + name + image_extension)) print("\nExported as " + name + image_extension) else: - raise Exception("export= can be only 'False' or 'True'") \ No newline at end of file + raise Exception("export= can be only 'False' or 'True'") diff --git a/test/test_papillon.py b/test/test_papillon.py index 51a7049..aa99f19 100644 --- a/test/test_papillon.py +++ b/test/test_papillon.py @@ -60,7 +60,7 @@ def test_read_folder(self): comparison_test=['Sample 1_vs_Sample 2', 'Sample 1_vs_Sample 3', 'Sample 1_vs_Sample 4', 'Sample 2_vs_Sample 3', 'Sample 2_vs_Sample 4', 'Sample 3_vs_Sample 4'] - self.assertTrue(test.comparison==comparison_test) + self.assertTrue(test.comparisons==comparison_test) self.assertEqual(len(test.genes_detect),5) self.assertEqual(len(test.genes_significant),3) self.assertEqual(len(test.isoforms_detect),28) @@ -78,51 +78,51 @@ def 
test_read_folder(self): def test_get_gene(self): sub=test.get_gene() - self.assertEqual(len(sub.selected),3) - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(len(sub.df),3) + self.assertEqual(len(sub.df.columns),18) sub=test.get_gene("IL17RC") - self.assertEqual(sub.selected.index[0],"MSTRG.10454") - self.assertEqual(len(sub.selected),1) - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(sub.df.index[0],"MSTRG.10454") + self.assertEqual(len(sub.df),1) + self.assertEqual(len(sub.df.columns),18) sub=test.get_gene(["IL6","CCL15"]) - self.assertEqual(len(sub.selected),2) - self.assertEqual(sub.selected.index[0],"IL6") - self.assertEqual(sub.selected.index[1],"CCL15-2") - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(len(sub.df),2) + self.assertEqual(sub.df.index[0],"IL6") + self.assertEqual(sub.df.index[1],"CCL15-2") + self.assertEqual(len(sub.df.columns),18) # Gene-Comparison Test sub=test.get_gene(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2") - self.assertEqual(len(sub.selected),1) - self.assertEqual(sub.selected.index[0],"IL6") - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(len(sub.df),1) + self.assertEqual(sub.df.index[0],"IL6") + self.assertEqual(len(sub.df.columns),18) sub=test.get_gene(["IL6","CCL15"], comparison="Sample 3_vs_Sample 4") - self.assertEqual(len(sub.selected),1) - self.assertEqual(sub.selected.index[0],"CCL15-2") - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(len(sub.df),1) + self.assertEqual(sub.df.index[0],"CCL15-2") + self.assertEqual(len(sub.df.columns),18) sub=test.get_gene(["IL6","CCL15"], comparison="Sample 2_vs_Sample 3") - self.assertEqual(len(sub.selected),0) - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(len(sub.df),0) + self.assertEqual(len(sub.df.columns),18) # Gene-Comparison-sign Test - sub=test.get_gene(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", sign=">") - self.assertEqual(len(sub.selected),0) - 
self.assertEqual(len(sub.selected.columns),18) + sub=test.get_gene(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", comparison_sign=">") + self.assertEqual(len(sub.df),0) + self.assertEqual(len(sub.df.columns),18) - sub=test.get_gene(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", sign="<") - self.assertEqual(len(sub.selected),1) - self.assertEqual(sub.selected.index[0],"IL6") - self.assertEqual(len(sub.selected.columns),18) + sub=test.get_gene(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", comparison_sign="<") + self.assertEqual(len(sub.df),1) + self.assertEqual(sub.df.index[0],"IL6") + self.assertEqual(len(sub.df.columns),18) sub=test.get_gene() - self.assertEqual(len(sub.selected),3) - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(len(sub.df),3) + self.assertEqual(len(sub.df.columns),18) with self.assertRaises(Exception): - sub=test.get_gene(comparison="Sample 1_vs_Sample 2", sign="Wrong") + sub=test.get_gene(comparison="Sample 1_vs_Sample 2", comparison_sign="Wrong") with self.assertRaises(Exception): sub=test.get_gene(sign=">") with self.assertRaises(Exception): @@ -130,62 +130,62 @@ def test_get_gene(self): def test_get_isoform(self): sub=test.get_isoform() - self.assertEqual(len(sub.selected),5) - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(len(sub.df),5) + self.assertEqual(len(sub.df.columns),18) sub=test.get_isoform("IL6") - self.assertEqual(sub.selected.index[0],"NM_000600.3") - self.assertEqual(len(sub.selected),1) - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(sub.df.index[0],"NM_000600.3") + self.assertEqual(len(sub.df),1) + self.assertEqual(len(sub.df.columns),18) sub=test.get_isoform("CCL15") - self.assertEqual(sub.selected.index[0],"NM_032965.4") - self.assertEqual(sub.selected.index[1],"NM_032965.4-2") - self.assertEqual(len(sub.selected),2) - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(sub.df.index[0],"NM_032965.4") + 
self.assertEqual(sub.df.index[1],"NM_032965.4-2") + self.assertEqual(len(sub.df),2) + self.assertEqual(len(sub.df.columns),18) sub=test.get_isoform(["IL6","CD44"]) - self.assertEqual(len(sub.selected),3) - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(len(sub.df),3) + self.assertEqual(len(sub.df.columns),18) sub=test.get_isoform(["IL6","CCL15"]) - self.assertEqual(len(sub.selected),3) - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(len(sub.df),3) + self.assertEqual(len(sub.df.columns),18) # Isoform-Comparison Test sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2") - self.assertEqual(len(sub.selected),1) - self.assertEqual(sub.selected.index[0],"NM_000600.3") - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(len(sub.df),1) + self.assertEqual(sub.df.index[0],"NM_000600.3") + self.assertEqual(len(sub.df.columns),18) sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 3_vs_Sample 4") - self.assertEqual(len(sub.selected),2) - self.assertEqual(sub.selected.index[0],"NM_032965.4") - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(len(sub.df),2) + self.assertEqual(sub.df.index[0],"NM_032965.4") + self.assertEqual(len(sub.df.columns),18) sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 2_vs_Sample 4") - self.assertEqual(len(sub.selected),1) - self.assertEqual(sub.selected.index[0],"NM_032965.4-2") - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(len(sub.df),1) + self.assertEqual(sub.df.index[0],"NM_032965.4-2") + self.assertEqual(len(sub.df.columns),18) sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 2_vs_Sample 3") - self.assertEqual(len(sub.selected),0) - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(len(sub.df),0) + self.assertEqual(len(sub.df.columns),18) # Isoform-Comparison-sign Test - sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", sign=">") - self.assertEqual(len(sub.selected),0) - 
self.assertEqual(len(sub.selected.columns),18) + sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", comparison_sign=">") + self.assertEqual(len(sub.df),0) + self.assertEqual(len(sub.df.columns),18) - sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", sign="<") - self.assertEqual(len(sub.selected),1) - self.assertEqual(sub.selected.index[0],"NM_000600.3") - self.assertEqual(len(sub.selected.columns),18) + sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", comparison_sign="<") + self.assertEqual(len(sub.df),1) + self.assertEqual(sub.df.index[0],"NM_000600.3") + self.assertEqual(len(sub.df.columns),18) # Final sub=test.get_isoform() - self.assertEqual(len(sub.selected),5) - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(len(sub.df),5) + self.assertEqual(len(sub.df.columns),18) def test_onlyFPKM(self): test=pp.read_folder(path) @@ -214,19 +214,19 @@ def test_onlyFPKM(self): # making extra_df sub=test.get_isoform() - extra_df=sub.selected.iloc[:4,2:6].T.copy() - extra_df=pd.DataFrame(data=extra_df.values, index=sub.selected.index[:4], columns=sub.selected.columns[2:6]) - extra_df['gene_short_name']=sub.selected['gene_short_name'][:4] + extra_df=sub.df.iloc[:4,2:6].T.copy() + extra_df=pd.DataFrame(data=extra_df.values, index=sub.df.index[:4], columns=sub.df.columns[2:6]) + extra_df['gene_short_name']=sub.df['gene_short_name'][:4] # testing extra_df - df=sub.onlyFPKM("df",extra_df=extra_df) + df=sub.plot.onlyFPKM(extra_df,sub.samples,"df") self.assertTrue(type(df)==pd.DataFrame) self.assertEqual(len(df),4) self.assertEqual(len(df.columns),4) self.assertEqual(df.index[0],"NM_000600.3") self.assertEqual(df.index[-1],"NM_032965.4") - df=sub.onlyFPKM("gene name",extra_df=extra_df) + df=sub.plot.onlyFPKM(extra_df,sub.samples,"gene name") self.assertTrue(type(df)==pd.DataFrame) self.assertEqual(len(df),4) self.assertEqual(len(df.columns),5) @@ -235,7 +235,7 @@ def test_onlyFPKM(self): 
self.assertEqual(df.index[-1],"NM_032965.4") self.assertEqual(df["gene_short_name"][-1],"CCL15") - df=sub.onlyFPKM("array",extra_df=extra_df) + df=sub.plot.onlyFPKM(extra_df,sub.samples,"array") self.assertTrue(type(df)==numpy.ndarray) self.assertEqual(len(df),4) self.assertEqual(list(df[0]),[0.0, 0.0, 4.0, 0.0]) @@ -244,7 +244,7 @@ def test_onlyFPKM(self): self.assertAlmostEqual(a[0],b[0], places=0) # testing remove_FPKM_name - df=sub.onlyFPKM("df",extra_df=extra_df, remove_FPKM_name=True) + df=sub.plot.onlyFPKM(extra_df,sub.samples,"df", remove_FPKM_name=True) self.assertTrue(type(df)==pd.DataFrame) self.assertEqual(len(df),4) self.assertEqual(len(df.columns),4) @@ -252,7 +252,7 @@ def test_onlyFPKM(self): self.assertEqual(df.index[0],"NM_000600.3") self.assertEqual(df.index[-1],"NM_032965.4") - df=sub.onlyFPKM("gene name",extra_df=extra_df, remove_FPKM_name=True) + df=sub.plot.onlyFPKM(extra_df,sub.samples,"gene name", remove_FPKM_name=True) self.assertTrue(type(df)==pd.DataFrame) self.assertEqual(len(df),4) self.assertEqual(len(df.columns),5) @@ -264,16 +264,16 @@ def test_onlyFPKM(self): # Final sub=test.get_isoform() - self.assertEqual(len(sub.selected),5) - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(len(sub.df),5) + self.assertEqual(len(sub.df.columns),18) def test_z_score(self): sub=test.get_isoform() df=sub.onlyFPKM("df") df1=zscore(df, axis=1, ddof=1) - df2=sub._z_score(df) - df2=sub.onlyFPKM("array",extra_df=df2) - self.assertAlmostEqual(df1.all(), df2.all(), places=0) + df2=sub.plot._z_score(df) + df2=sub.plot.onlyFPKM(df2,sub.samples,"array") + self.assertTrue(numpy.allclose(df1, df2)) def test_search(self): search_result=test.search(word="IL6",where="genes_detected", how="list") @@ -309,31 +309,31 @@ def test_search(self): self.assertEqual(len(search_result.columns),18) self.assertEqual(search_result.index[0],"IL6") - search_result=test.search(word="CCL15",where="genes_significant", how="selected") - 
self.assertEqual(len(test.selected),1) - self.assertEqual(len(test.selected.columns),18) - self.assertEqual(test.selected.index[0],"CCL15-2") +# search_result=test.search(word="CCL15",where="genes_significant", how="selected") +# self.assertEqual(len(test.df),1) +# self.assertEqual(len(test.df.columns),18) +# self.assertEqual(test.df.index[0],"CCL15-2") - search_result=test.search(word="CD44",where="isoforms_significant", how="selected") - self.assertEqual(len(test.selected),2) - self.assertEqual(len(test.selected.columns),18) - self.assertEqual(test.selected.index[0],"NM_000610.3") - self.assertEqual(test.selected.index[-1],"NM_001001389.1") +# search_result=test.search(word="CD44",where="isoforms_significant", how="selected") +# self.assertEqual(len(test.df),2) +# self.assertEqual(len(test.df.columns),18) +# self.assertEqual(test.df.index[0],"NM_000610.3") +# self.assertEqual(test.df.index[-1],"NM_001001389.1") sub=test.get_isoform() - self.assertEqual(len(sub.selected),5) - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(len(sub.df),5) + self.assertEqual(len(sub.df.columns),18) sub=test.get_gene() - self.assertEqual(len(sub.selected),3) - self.assertEqual(len(sub.selected.columns),18) + self.assertEqual(len(sub.df),3) + self.assertEqual(len(sub.df.columns),18) def test_fusion_gene_id(self): sub=test.get_isoform() - m=len(sub.selected.columns) - df=sub.selected.copy() + m=len(sub.df.columns) + df=sub.df.copy() - df2=sub._fusion_gene_id(df, sub.what, change_index=False) + df2=sub.plot._fusion_gene_id(df, sub.what, change_index=False) self.assertEqual(len(df2["gene/ID"]),5) self.assertEqual(len(df2.columns),19) self.assertEqual(df2["gene/ID"][0],"IL6 NM_000600.3") @@ -343,31 +343,31 @@ def test_fusion_gene_id(self): self.assertEqual(df2["gene_short_name"][0],"IL6") self.assertEqual(df2["gene_short_name"][-1],"CCL15") - df2=sub._fusion_gene_id(df, sub.what, change_index=True) + df2=sub.plot._fusion_gene_id(df, sub.what, change_index=True) 
self.assertEqual(len(df2),5) self.assertEqual(len(df2.columns),17) self.assertEqual(df2.index[0],"IL6 NM_000600.3") self.assertEqual(df2.index[-1],"CCL15 NM_032965.4-2") sub=test.get_gene() - df=sub.selected.copy() - df2=sub._fusion_gene_id(df, sub.what, change_index=False) + df=sub.df.copy() + df2=sub.plot._fusion_gene_id(df, sub.what, change_index=False) self.assertEqual(len(df2),3) self.assertEqual(len(df2.columns),18) self.assertEqual(df2.index[1],"MSTRG.10454") self.assertEqual(df2.index[-1],"CCL15-2") - df2=sub._fusion_gene_id(df, sub.what, change_index=True) + df2=sub.plot._fusion_gene_id(df, sub.what, change_index=True) self.assertEqual(len(df2),3) self.assertEqual(len(df2.columns),17) self.assertEqual(df2.index[0],"IL6") self.assertEqual(df2.index[-1],"CCL15") - n=len(sub.selected.columns) + n=len(sub.df.columns) self.assertTrue(m==n) def test_comparison(self): - a,b,c=test._compare() + a,b,c=test.Manipulate._compare(test) self.assertEqual(a,{'IL17RC'}) self.assertEqual(b,{'CD44'}) self.assertEqual(c,2) @@ -376,7 +376,7 @@ def test_drop_comparison(self): def drop(comp): test2=pp.read_folder(path,drop_comparison=comp) test3=pp.read_folder(path) - test3.dropComparison(comp) + test3.drop_comparison(comp) df1=test2.genes_significant.all() df2=test3.genes_significant.all() self.assertTrue(df1.all()==df2.all()) @@ -384,9 +384,9 @@ def drop(comp): def multidrop(comp): test2=pp.read_folder(path) test3=pp.read_folder(path) - test2.dropComparison(comp) + test2.drop_comparison(comp) for c in comp: - test3.dropComparison(c) + test3.drop_comparison(c) df1=test2.genes_significant.all() df2=test3.genes_significant.all() self.assertTrue(df1.all()==df2.all()) @@ -406,7 +406,7 @@ def multidrop(comp): test2=pp.read_folder(path) with self.assertRaises(Exception): - test2.dropComparison("Wrong") + test2.drop_comparison("Wrong") del test2 def test_change_samples_order(self): @@ -415,7 +415,7 @@ def test_change_samples_order(self): samples_test=["Sample 4","Sample 3","Sample 
2","Sample 1"] self.assertTrue(test.samples==samples_test) comparison_test=['Sample 1_vs_Sample 2', 'Sample 1_vs_Sample 3', 'Sample 1_vs_Sample 4', 'Sample 2_vs_Sample 3', 'Sample 2_vs_Sample 4', 'Sample 3_vs_Sample 4'] - self.assertTrue(test.comparison==comparison_test) + self.assertTrue(test.comparisons==comparison_test) self.assertEqual(len(test.genes_detect),5) self.assertEqual(len(test.genes_significant),3) self.assertEqual(len(test.isoforms_detect),28) @@ -444,21 +444,21 @@ def plot_maker(type_sel,z_score): if z_score == True: df_ = sub.onlyFPKM(return_as="df",remove_FPKM_name=True) - df_norm = sub._z_score(df_) - df_norm["gene_short_name"] = sub.selected["gene_short_name"] + df_norm = sub.plot._z_score(df_) + df_norm["gene_short_name"] = sub.df["gene_short_name"] df_ = df_norm.copy() elif z_score==False: df_ = sub.onlyFPKM(return_as="gene name",remove_FPKM_name=True) if type_sel == "gene": hue = "gene_short_name" - df_ = sub._fusion_gene_id(df_, type_sel, change_index=False) + df_ = sub.plot._fusion_gene_id(df_, type_sel, change_index=False) elif type_sel == "isoform": hue = "gene/ID" - df_ = sub._fusion_gene_id(df_, type_sel, change_index=True) + df_ = sub.plot._fusion_gene_id(df_, type_sel, change_index=True) df_ = df_.reset_index() -# df_ = test._fusion_gene_id(df_, type_sel, change_index=False) +# df_ = test.plot._fusion_gene_id(df_, type_sel, change_index=False) df = pd.melt(df_, id_vars=hue, var_name="Sample", value_name="FPKM") g = sns.factorplot(x="Sample", y="FPKM", hue=hue, @@ -502,7 +502,7 @@ def heatmap_maker(z_score, type_sel): elif type_sel == "isoform": sub=test.get_isoform() df_heatmap = sub.onlyFPKM(return_as="gene name",remove_FPKM_name=True) - df_heatmap = sub._fusion_gene_id(df_heatmap, type_sel, change_index=True) + df_heatmap = sub.plot._fusion_gene_id(df_heatmap, type_sel, change_index=True) im1 = sns.clustermap(df_heatmap, col_cluster=False, method="complete", cmap="seismic", z_score=z_score) im1.savefig(str(test.path + "test.png")) 
@@ -517,7 +517,7 @@ def image_check(): sub=test.get_gene() heatmap_maker(0,"gene") - sub.heatmap(export=True) + sub.heatmap(z_score=True,export=True) image_check() heatmap_maker(None,"gene") @@ -527,7 +527,7 @@ def image_check(): sub=test.get_isoform() heatmap_maker(0,"isoform") - sub.heatmap(export=True) + sub.heatmap(z_score=True,export=True) image_check() heatmap_maker(None,"isoform") From 12e796c891c4d7739b0bcca49be2f2713c0da7bc Mon Sep 17 00:00:00 2001 From: domenico-somma <34346930+domenico-somma@users.noreply.github.com> Date: Sat, 16 Jun 2018 11:27:40 +0100 Subject: [PATCH 3/8] Ver 0.2.0a Now you can keep the genes/isoforms subselection in a variable Add or compare two subselections You can select either gene/isoform significant expressed for at least one condition or not significant at all. Plot gene/isoform significant for at least one condition or not significant with continuous and dashed line. self.read_db() deprecated self.plot() deprecated Add check if papillon folder is removed --- .cache/v/cache/lastfailed | 4 + CHANGE.txt | 5 +- README.md | 3 +- papillon.py | 515 +++++++++++++++++-------------- test/Test_files/isoform_exp.diff | 12 +- test/test_papillon.py | 351 +++++++++++++++++---- 6 files changed, 589 insertions(+), 301 deletions(-) create mode 100644 .cache/v/cache/lastfailed diff --git a/.cache/v/cache/lastfailed b/.cache/v/cache/lastfailed new file mode 100644 index 0000000..ed0e462 --- /dev/null +++ b/.cache/v/cache/lastfailed @@ -0,0 +1,4 @@ +{ + "test/test_papillon.py::papillon_Test::test_read_folder": true, + "test/test_papillon.py::papillon_Test::test_selected_exist": true +} \ No newline at end of file diff --git a/CHANGE.txt b/CHANGE.txt index f1c361b..d6fd82a 100644 --- a/CHANGE.txt +++ b/CHANGE.txt @@ -1,5 +1,8 @@ v 0.2.0, -18 -- Major changes: -Now you can keep the genes subselection in a variable +Now you can keep the genes/isoforms subselection in a variable +Add or compare two subselections +You can select either gene/isoform 
significant expressed for at least one condition or not significant at all. +Plot gene/isoform significant for at least one condition or not significant with continuous and dashed line. self.read_db() deprecated self.plot() deprecated Add check if papillon folder is removed diff --git a/README.md b/README.md index 6fe7e22..e6cdb10 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,8 @@ You can use Papillon with any Scientific IDE, and install it using Pypi: or Anaconda with conda-forge channel: - conda install -c conda-forge papillon + conda config --add channels conda-forge + conda install papillon ## Usage diff --git a/papillon.py b/papillon.py index 9e019c2..646c9cf 100644 --- a/papillon.py +++ b/papillon.py @@ -27,8 +27,7 @@ def read_folder(path, drop_comparison=None): them to _papillon_builder(). path - accept a str with the folder path, containing the cuffdiff files - drop_comparison - drop comparison (str) or list of comparisons and - re-calculate significant genes/isoforms""" + drop_comparison - drop comparison (str) or list of comparisons""" if drop_comparison is None: drop_comparison = [] try: @@ -75,8 +74,7 @@ def read_files(files, path=None, drop_comparison=None): files - accept an iterable with the cuffdiff files path - where export Papillon generated files - drop_comparison - drop comparison (str) or list of comparisons and - re-calculate significant genes/isoforms""" + drop_comparison - drop comparison (str) or list of comparisons""" if drop_comparison is None: drop_comparison = [] @@ -96,10 +94,11 @@ def read_files(files, path=None, drop_comparison=None): gene_fpkm = file.copy() else: isoform_fpkm = file.copy() - + return _papillon_builder(isoform_fpkm, isoform_diff, gene_fpkm, gene_diff, path, drop_comparison) def _make_folder(path): + """Create "Papillon" folder""" if not os.path.exists(path): os.makedirs(path) @@ -147,9 +146,9 @@ def _papillon_builder(isoform_fpkm, isoform_diff, gene_fpkm, gene_diff, path, dr col_sample1 = [] col_sample2 = [] 
for sample in samples: - if sample in list(isoform_diff["sample_1"]): # TO DO - To change with tolist? + if sample in isoform_diff["sample_1"].tolist(): col_sample1.append(sample) - if sample in list(isoform_diff["sample_2"]): + if sample in isoform_diff["sample_2"].tolist(): col_sample2.append(sample) # generate comparisons name list @@ -169,23 +168,17 @@ def _papillon_builder(isoform_fpkm, isoform_diff, gene_fpkm, gene_diff, path, dr elif comparison not in drop_comparison: comparisons.append(comparison) print(comparison) - else: # TO DO - Really necessary? - pass - else: - pass + if n_left != 0: raise Exception(drop_comparison, " not found") - genes_detect, genes_significant = _generate_df( - "gene", samples, gene_fpkm, gene_diff, isoform_fpkm, isoform_diff, comparisons) - isoforms_detect, isoforms_significant = _generate_df( - "isoform", samples, gene_fpkm, gene_diff, isoform_fpkm, isoform_diff, comparisons) + genes_detected = _generate_df("gene", samples, gene_fpkm, gene_diff, isoform_fpkm, isoform_diff, comparisons) + isoforms_detected = _generate_df("isoform", samples, gene_fpkm, gene_diff, isoform_fpkm, isoform_diff, comparisons) - return Papillon_db(path, samples, comparisons, genes_detect, genes_significant, isoforms_detect, isoforms_significant) + return Papillon_db(path, samples, comparisons, genes_detected, isoforms_detected)#, genes_significant, isoforms_detect, isoforms_significant) #def _read_folder_testing(): #delegare e return papillon_db and cummerbund - def _generate_df(what, samples, gene_fpkm, gene_diff, isoform_fpkm, isoform_diff, comparisons): """Make dataframe for genes/isoforms detected and significant""" @@ -213,15 +206,17 @@ def gene_or_isoform(what, gene_fpkm, gene_diff, isoform_fpkm, isoform_diff): df[comparison] = [True if signif == "yes" else False for signif in df2["significant"]] df[str("q-value_" + comparison)] = df2["q_value"] + df[str("fi_log2_" + comparison)] = df2["log2(fold_change)"] m = 2 n = len(samples) + 2 TrueFalseMask = 
df.iloc[:, m:n] > 0 # with at least 1 value>0 - df_detected = df[TrueFalseMask.any(axis=1)] - print("\n\tDetected ", what + "s: ", len(df_detected)) - - df_significant = _Manipulate_db._significant(df_detected, comparisons, what) - return df_detected, df_significant + df = df[TrueFalseMask.any(axis=1)] + print("\n\tDetected ", what + "s: ", len(df)) + + df_significant = df[df.loc[:, comparisons].any(axis=1)] + print("\n\tSignificant expressed ", what + "s: ", len(df_significant)) + return df def _FPKM(name_list): @@ -266,7 +261,6 @@ def _obtain_list(genelist, path): # TO DO - eventually remove empty one pass return gene_list - #class _Cummerbund: # def __init__(self, isoform_fpkm, isoform_diff, gene_fpkm, gene_diff): # self.isoform_fpkm @@ -274,30 +268,23 @@ def _obtain_list(genelist, path): # TO DO - eventually remove empty one # self.gene_fpkm # self.gene_diff - class Papillon_db: """Make a Papillon_db object and permit to change some values self.path - files path self.samples - samples found self.comparisons - comparisons found - self.genes_detect - dataframe of genes detected - self.genes_significant - dataframe of genes significant - self.isoforms_detect - dataframe of isoforms detected - self.isoforms_significant - dataframe of isoforms significant - expressed + self.genes_detected - dataframe of genes detected + self.isoforms_detected - dataframe of isoforms detected redefine __str__""" - def __init__(self, path, samples, comparisons, genes_detected, genes_significant, isoforms_detected, isoform_significant): + def __init__(self, path, samples, comparisons, genes_detected, isoforms_detected):#, genes_significant, isoforms_detected, isoform_significant): self.path = path self.samples = samples self.comparisons = comparisons - self.genes_detect = genes_detected - self.genes_significant = genes_significant - self.isoforms_detect = isoforms_detected - self.isoforms_significant = isoform_significant - + self.genes_detected = genes_detected + 
self.isoforms_detected = isoforms_detected self.Manipulate = _Manipulate_db() self.Manipulate._compare(self) print("\n...Done") @@ -305,12 +292,11 @@ def __init__(self, path, samples, comparisons, genes_detected, genes_significant def __str__(self): a = "Samples: " + str(self.samples) + "\n" b = "Comparison: " + str(self.comparisons) + "\n" - c = "Genes Detected: " + str(len(self.genes_detect)) + "\n" - d = "Genes differential expressed: " + \ - str(len(self.genes_significant)) + "\n" - e = "Isoform Detected: " + str(len(self.isoforms_detect)) + "\n" - f = "Isoform differential expressed: " + \ - str(len(self.isoforms_significant)) + "\n" + c = "Genes Detected: " + str(len(self.genes_detected)) + "\n" + d = "Genes differential expressed: " + str(len(self.Manipulate.significant(self,"gene"))) + "\n" + e = "Isoform Detected: " + str(len(self.isoforms_detected)) + "\n" + f = "Isoform differential expressed: " + str(len(self.Manipulate.significant(self,"isoform"))) + "\n" + visual = a + b + c + d + e + f return visual @@ -327,7 +313,7 @@ def change_order(self, new_order): new_order: list of samples order""" self = self.Manipulate.change_order(self, new_order) - def get_gene(self, genelist=None, comparison=None, comparison_sign=None, fold_ind=None, fol_sign=None): + def get_gene(self, genelist=None, comparison="all", comparison_sign=None, fold_ind=None, fold_sign=">"): """This function select genes. 
It return a Papillon_list object genelist - accept string (1 gene name), list of gene names or file @@ -338,9 +324,12 @@ def get_gene(self, genelist=None, comparison=None, comparison_sign=None, fold_in "<" fold_ind - fold induction (log2) higher then number """ - return self.Manipulate.get_gene(self, genelist, comparison, comparison_sign, fold_ind, fol_sign) + try: + fold_ind=float(fold_ind) + except: pass + return self.Manipulate.get_gene(self, genelist=genelist, comparison=comparison, comparison_sign=comparison_sign, fold_ind=fold_ind, fold_sign=fold_sign) - def get_isoform(self, genelist=None, comparison=None, comparison_sign=None, fold_ind=None, fol_sign=None): + def get_isoform(self, genelist=None, comparison="all", comparison_sign=None, fold_ind=None, fold_sign=">"): """This function select isoforms. It creates a Papillon object genelist - accept string (gene name), list of gene names or file @@ -351,7 +340,10 @@ def get_isoform(self, genelist=None, comparison=None, comparison_sign=None, fold "<" fold_ind - fold induction (log2) higher then number""" - return self.Manipulate.get_isoform(self, genelist, comparison, comparison_sign, fold_ind, fol_sign) + try: + fold_ind=float(fold_ind) + except: pass + return self.Manipulate.get_isoform(self, genelist=genelist, comparison=comparison, comparison_sign=comparison_sign, fold_ind=fold_ind, fold_sign=fold_sign) def search(self, word, where, how="table", export=False): """search among genes/isoforms names in detected and significant @@ -365,20 +357,22 @@ def search(self, word, where, how="table", export=False): how - accept: "table" return the dataframe with the genes found - "list" return a list of names, no duplicates - "selected" put the genes found among the differential expressed - genes in self.selected (to plot), - working only with where="significant" """ + "list" return a list of names, no duplicates""" +# "selected" put the genes found among the differential expressed +# genes in self.selected (to plot), +# 
working only with where="significant" """ return self.Manipulate.search(self, word, where, how, export) class _Manipulate_db: + """Class containing the functions to manipulate Papillon_db""" # TO DO - Add the function to export the Papillon_db (as table? as sqlite?) - @staticmethod - def _compare(pp): + def _compare(self, pp): """Compare genes and isoforms significantly expressed""" - genes = list(pp.genes_significant["gene_short_name"]) - isoforms = list(pp.isoforms_significant["gene_short_name"]) + genes = self.significant(pp,"gene") + isoforms = self.significant(pp,"isoform") + genes = genes["gene_short_name"].tolist() + isoforms = isoforms["gene_short_name"].tolist() genes_not_found = [] isoforms_not_found = [] for name in genes: @@ -414,10 +408,8 @@ def dropComparison(self, pp, comparison): def dropComp(comp): if comp in pp.comparisons: - del pp.isoforms_detect[comp] - del pp.isoforms_detect[str("q-value_" + comp)] - del pp.genes_detect[comp] - del pp.genes_detect[str("q-value_" + comp)] + del pp.isoforms_detected[comp] + del pp.isoforms_detected[str("q-value_" + comp)] pp.comparisons.remove(comp) print(comp, " removed") else: @@ -428,23 +420,19 @@ def dropComp(comp): else: for comp in comparison: dropComp(comp) - - pp.genes_significant = self._significant( - pp.genes_detect, pp.comparisons, "gene") - pp.isoforms_significant = self._significant( - pp.isoforms_detect, pp.comparisons, "isoform") self._compare(pp) print("...Done") return pp @staticmethod - def _significant(df_detected, comparison, what): - """Calculate significant expressed genes.""" - if what not in ["gene", "isoform"]: + def significant(pp, what): + """Calculate significant expressed genes.""" + if what == "gene": + df_significant = pp.genes_detected[pp.genes_detected.loc[:, pp.comparisons].any(axis=1)] + elif what == "isoform": + df_significant = pp.isoforms_detected[pp.isoforms_detected.loc[:, pp.comparisons].any(axis=1)] + else: raise Exception("what= not known") - df_significant = 
df_detected[ - df_detected.loc[:, comparison].any(axis=1)] - print("\n\tSignificant expressed ", what + "s: ", len(df_significant)) return df_significant @staticmethod @@ -459,38 +447,36 @@ def change_order(pp, new_order): if sample not in pp.samples: raise Exception(sample, "Sample not known") - cols = pp.genes_detect.columns.tolist() + cols = pp.genes_detected.columns.tolist() cols = cols[:2] + _FPKM(new_order) + cols[n_sampl + 2:] pp.samples = new_order - pp.genes_detected = pp.genes_detect[cols] - pp.genes_significant = pp.genes_significant[cols] - pp.isoforms_detect = pp.isoforms_detect[cols] - pp.isoforms_significant = pp.isoforms_significant[cols] + pp.genes_detected = pp.genes_detected[cols] + pp.isoforms_detected = pp.isoforms_detected[cols] return pp @staticmethod - def _select(pp, genelist, what):#, comparison, sign): + def _select(pp, genelist, what): """Part of get_gene/get_isoform function""" if what != "gene" and what != "isoform": raise Exception("Only what=gene or what=isoform admitted") gene_list = _obtain_list(genelist, path=pp.path) if what == "gene": - df = pd.DataFrame.copy(pp.genes_significant) + df = pd.DataFrame.copy(pp.genes_detected) # QUi era significant, ora e' detected. Aggiungere controlli. Testare bene elif what == "isoform": - df = pd.DataFrame.copy(pp.isoforms_significant) + df = pd.DataFrame.copy(pp.isoforms_detected) n=0 if gene_list != []: df["Selected"] = [True if name in gene_list else False for name in df["gene_short_name"]] df = df[df["Selected"] == True].iloc[:, :-1] for name in gene_list: - if name not in list(df["gene_short_name"]): - print("Gene name not found:\t", name) # return not found list + if name not in df["gene_short_name"].tolist(): + print("Gene name not found:\t", name) # TO DO: return not found list ? 
n+=1 print("Number of gene not found: ",n) return df - def search(self, pp ,word, where, how="table", export=False): + def search(self, pp ,word, where, how="table", export=False): # TO DO - To fix """search among genes/isoforms names in detected and significant word - accept a str to search among the gene names @@ -515,7 +501,7 @@ def df_or_list(df_, how_): print(names) return names - # Checking input + # Checking input if where not in ["genes_detected", "genes_significant", "isoforms_detected", "isoforms_significant"]: raise Exception("where= not known") elif how not in ["table", "list", "selected"]: @@ -524,15 +510,15 @@ def df_or_list(df_, how_): pass word1, word2 = where.split("_") if word1 == "genes": - df = pp.genes_detect[pp.genes_detect["gene_short_name"].str.contains( - word)] - df_sig = pp.genes_significant[pp.genes_significant["gene_short_name"].str.contains( + df = pp.genes_detected[pp.genes_detected["gene_short_name"].str.contains( word)] + df_sig = self.significant(pp,"gene") + df_sig = df_sig[df_sig["gene_short_name"].str.contains(word)] elif word1 == "isoforms": - df = pp.isoforms_detect[pp.isoforms_detect["gene_short_name"].str.contains( - word)] - df_sig = pp.isoforms_significant[pp.isoforms_significant["gene_short_name"].str.contains( + df = pp.isoforms_detected[pp.isoforms_detected["gene_short_name"].str.contains( word)] + df_sig = self.significant(pp,"isoform") + df_sig = df_sig[df_sig["gene_short_name"].str.contains(word)] if len(df) == 0: print(word, " not found") @@ -559,11 +545,22 @@ def df_or_list(df_, how_): print("Differential expressed genes preview available") found = df_or_list(df_sig, how) - self._export(found, export=export, name="search_result") + if export is True: + _make_folder(pp.path) + try: + found.to_excel(str(pp.path + 'found.xls'), sheet_name='Sheet1') + print("\nExported as found.xls\n") + except: + file = open(pp.path + 'found.txt','w') + to_write="\n".join(found) + file.write(to_write) + file.close() + 
print("\nExported as found.txt\n") + print(found) return found - def get_gene(self, pp, genelist=None, comparison=None, comparison_sign=None, fold_ind=None, fold_sign=None): + def get_gene(self, pp, genelist=None, comparison="all", comparison_sign=None, fold_ind=None, fold_sign=">"): """This function select genes. It creates a Papillon object genelist - accept string (1 gene name), list of gene names or file @@ -573,11 +570,14 @@ def get_gene(self, pp, genelist=None, comparison=None, comparison_sign=None, fol sign - usable in combination with comparison, accept either ">" or "<" """ - df = self._select(pp, genelist, "gene")#, comparison, comparison_sign) # Why I have done this pre-selection? - return Papillon_list(df, "gene", pp.comparisons, pp.path, pp.samples ,comparison, comparison_sign, fold_sign, fold_ind) - # To do - Return number genes not found - def get_isoform(self, pp, genelist=None, comparison=None, comparison_sign=None, fold_ind=None, fold_sign=None): + if isinstance(genelist, Papillon_list): + genelist=set(genelist.df["gene_short_name"].tolist()) + df = self._select(pp, genelist, "gene") # To do - Return number genes not found? + return Papillon_list(df, "gene", pp.comparisons, pp.path, pp.samples , comparison, comparison_sign, fold_ind, fold_sign) + + + def get_isoform(self, pp, genelist=None, comparison="all", comparison_sign=None, fold_ind=None, fold_sign=">"): """This function select isoforms. It creates a Papillon object genelist - accept string (gene name), list of gene names or file @@ -588,52 +588,48 @@ def get_isoform(self, pp, genelist=None, comparison=None, comparison_sign=None, "<" export - True/False whether want or not export the dataframe of selected genes""" - df = self._select(pp, genelist, "isoform")#, comparison, comparison_sign) # Why I have done this pre-selection? 
- return Papillon_list(df, "isoform", pp.comparisons, pp.path, pp.samples, comparison, comparison_sign, fold_sign, fold_ind) + if isinstance(genelist, Papillon_list): + genelist=genelist.df["gene_short_name"].tolist() + df = self._select(pp, genelist, "isoform") + return Papillon_list(df, "isoform", pp.comparisons, pp.path, pp.samples, comparison, comparison_sign, fold_ind, fold_sign) - def _export(self, thing, export, name=None): - """Manage dataframe or image export parameter.""" - if export is False: - return - elif export is True: - _make_folder(self.path) - thing.to_excel(str(self.path + name + '.xls'), sheet_name='Sheet1') - print("\nExported as " + name + ".xls\n") - else: - raise Exception("export= can be only 'False' or 'True'") - +# def export(self, pp, name=None): TO DO +# if name is None: +# name="papillon_db" +# pp.genes_detected.to_excel("file_path.xls", sheet_name="Genes") +# pp.isoforms_detected = isoforms_detected + class Papillon_list: - def __init__(self, df, what, comparisons, path, samples, comparison=None, comparison_sign=None, fold_ind=None, fold_sign=">"): + """Class containing a selected list of genes, with data associated + + df should have: + [isoform_id(optional?)] [gene_id] [gene_name] [At least 2 readcounts - n] [at least 1 vs condition (n-1+n-2+n-3...ecc)] [at least 1 q_value(n-1+n-2+n-3...ecc)] #check if p_value>q_value + """ + def __init__(self, df, what, comparisons, path, samples, comparison="all" , comparison_sign=None, fold_ind=None, fold_sign=">",p=0.05): + self.Manipulate = _Manipulate_list() self.df=df - self.what=what - self.path=path + self.what=what #Change name in gene_or_isoform ? + self.path=path #necessary? Should be None as default? 
self.samples=samples -# self.genelist=genelist - if comparison is None: - if comparison_sign is not None: - raise Exception("Sign passed but not comparison") - else: - self.comparison=comparisons - elif comparison is not None: - if comparison not in comparisons: - raise Exception("Comparison not found") - else: - self.comparison=comparison self.comparison_sign=comparison_sign - self.fold_ind=fold_ind + self.fold_ind=fold_ind #change name in fc + if fold_ind is not None and fold_ind <= 0: + raise Exception('Fold_ind should be > 0') + if fold_sign not in [">", "<"]: #fc_sign + raise Exception(fold_sign,'Only ">" "<" usable.') self.fold_sign=fold_sign self.plot=_Plot() - self.Manipulate = _Manipulate_list() - self.Manipulate._sub_select(self, comparison, comparison_sign, fold_ind, fold_sign) + self.Manipulate._sub_select(self, comparisons, comparison) #exp_comparisons, keep_comparison def __str__(self): - a = "Number of "+ self.what + " selected: "+ str(len(self.df)) + "\n" - b = "Samples: " + str(self.samples) + "\n" - visual = a + b + a = "Type of selection: "+ self.what + "\n" + b = "Number of "+ self.what + " selected: "+ str(len(self.df)) + "\n" + c = "Samples: " + str(self.samples) + "\n" + visual = a + b + c if self.comparison_sign is not None: - w1,w2=_vs(self.comparison) + w1,w2=_vs(self.comparison[0]) n = "Comparison selected: " + w1 + self.comparison_sign + w2 + "\n" else: n = "Comparison selected: " + str(self.comparison) + "\n" @@ -641,14 +637,15 @@ def __str__(self): if self.fold_ind is not None: n = "Fold induction log2" + self.comparison_sign + str(self.fold_ind) + "\n" visual = visual + n - self.show() return visual def __add__(self, other): if self.what != other.what: raise Exception("Impossible, one is gene, the other isoform") - elif self.samples != other.samples or self.comparison != other.comparison or self.path != other.path: + elif self.samples != other.samples or self.path != other.path: raise Exception("The two elements seems to have 
different origins") + elif self.comparison != other.comparison: + raise Exception("Different comparisons between the Papillon_lists") df= pd.merge(self.df, other.df, how='outer') return Papillon_list(df, what=self.what, comparisons=self.comparison, comparison=None, path=self.path, samples=self.samples) @@ -656,15 +653,23 @@ def __radd__(self, other): if other == 0: return self else: - return self.__add__(other) # sum([T1, T2, T3]) + return self.__add__(other) # sum([T1, T2, T3]) - def show(self): + def show(self): #to_improve - list gene, df + """Show genes/isoforms as Dataframe """ self.Manipulate.show(self) - -# def __getattr__(self, arg): # TO DO - Add signle functions with descriptions -# _plot=_Plot() -# setattr(_plot, "pp", self) -# return getattr(_plot, arg) + +# def select(self): TO DO +# pass + #create another Papillon_list object + + def export(self): + """ Export the selected genes/isoforms as excel file.""" + self.Manipulate.export(self) + + def compare(self, other): + """ Compare two papillon objects""" + self.Manipulate.compare(self, other) def onlyFPKM(self, return_as, remove_FPKM_name=False): """Take a Papillon dataframe and a list of samples, return only FPKM columns. @@ -692,9 +697,8 @@ def heatmap(self, z_score=False, col_cluster=False, method="complete", cmap="sei """ self.plot.heatmap(self, z_score, col_cluster, method, cmap, export, **options) - def lineplot(self, title="", legend=True, z_score=False, export=False, df=None, size=10, ci=None, **option): - """ - LinePlot selected genes expression levels. Max number of genes 200 + def lineplot(self, title="", legend=True, z_score=False, export=False, size=10, ci=None, **option): + """LinePlot selected genes expression levels. 
Max number of genes 200 title - accept a str as title of the plot legend - True/False show the legend @@ -702,69 +706,111 @@ def lineplot(self, title="", legend=True, z_score=False, export=False, df=None, export - True/False whether or not export the image df - accept an exernal dataframe, different from self.selected **options - all the options accepted by seaborn.factorplot""" - self.plot.lineplot(self, title, legend, z_score, export, df, size, ci, **option) + self.plot.lineplot(self, title, legend, z_score, export, size, ci, **option) class _Manipulate_list: - def _sub_select(self, pp, comparison, comparison_sign, fold_ind, fold_sign): - """ Part of get_gene/get_isoform function""" + """Class containing all the functions to manipulate a Papillon_list object""" + def _sub_select(self, pp, comparisons, comparison): + """Calculate significant expressed genes/isoform""" - ACCEPTED_SIGN = [">", "<", None] - - if comparison_sign is not None: - if comparison_sign not in ACCEPTED_SIGN: - raise Exception('Only ">" "<" usable.') + def fc_select(df,comparisons,comparison): if comparison is None: - raise Exception("Comparison_sign passed, but not comparison") + if pp.fold_ind is not None: + df2 = pd.DataFrame(columns=df.columns) + for comp in comparisons: + if pp.fold_sign is ">": + we = df[df["fi_log2_"+comp] >= pp.fold_ind] + we2 = df[df["fi_log2_"+comp] <= -pp.fold_ind] + we3 = we.combine_first(we2) + elif pp.fold_sign is "<": + we3 = df[(df["fi_log2_"+comp] <= pp.fold_ind) & (df["fi_log2_"+comp] >= -pp.fold_ind)] + df2 = df2.combine_first(we3) + else: return df + else: + if pp.fold_ind is not None: + df2 = pd.DataFrame(columns=df.columns) + for comp in comparisons: + if pp.fold_sign is ">": + we = df[(df[comp] == True) & (df["fi_log2_"+comp] >= pp.fold_ind)] #comp==True only if comparison not True + we2 = df[(df[comp] == True) & (df["fi_log2_"+comp] <= -pp.fold_ind)] + we3 = we.combine_first(we2) + + elif pp.fold_sign is "<": + we3 = df[(df[comp] == True) & 
(df["fi_log2_"+comp] <= pp.fold_ind) & (df["fi_log2_"+comp] >= -pp.fold_ind)] + df2 = df2.combine_first(we3) + else: return df + return df2 - selected = pp.df - if comparison is not None: - selected = selected[selected[comparison] == True] - sample1, sample2 = _vs(comparison) - if comparison_sign == ">": - selected = selected[selected[_FPKM(sample1)] > selected[_FPKM(sample2)]] - elif comparison_sign == "<": - selected = selected[selected[_FPKM(sample1)] < selected[_FPKM(sample2)]] - if fold_ind is not None: - fi=[str("fi_log2_")+comp for comp in self.comparison] - if fold_sign is ">": - TrueFalseMask1=selected[fi]>fold_ind# or [df[fi]<-fold_ind] - TrueFalseMask2=selected[fi]<-fold_ind - if fold_sign is "<": - TrueFalseMask1=selected[fi]-fold_ind - - TrueFalseMask1=TrueFalseMask1.any(axis=1) - TrueFalseMask2=TrueFalseMask2.any(axis=1) -# TrueFalseMask=pd.merge(TrueFalseMask1,TrueFalseMask2,how="left") - TrueFalseMask=selected.copy() - TrueFalseMask["A"]=TrueFalseMask1 - TrueFalseMask["B"]=TrueFalseMask2 - TrueFalseMask=TrueFalseMask.loc[:,["A","B"]] - TrueFalseMask=TrueFalseMask.any(axis=1) - selected=selected[TrueFalseMask] - pp.df=selected.copy() - + #input check + if pp.comparison_sign not in [">", "<", None]: + raise Exception('Only ">" "<" usable.') + + # to add p=0.05 + + if comparison is None: + if pp.comparison_sign is not None: + raise Exception("Sign passed but not comparison") + else: + df_moment=pp.df.copy() + comp=comparisons + selected = fc_select(df_moment,comp,comparison) + pp.df = selected.copy() + else: + if comparison is "All" or comparison is "all": + if pp.comparison_sign is not None: + raise Exception("Sign passed but not comparison") + else: + comp=comparisons + df_moment = pp.df[pp.df.loc[:, comp].any(axis=1)] + elif comparison in comparisons: + sample1, sample2 = _vs(comparison) + df_moment = pp.df[pp.df.loc[:, comparison]] + if pp.comparison_sign == ">": + df_moment = df_moment[df_moment[_FPKM(sample1)] > df_moment[_FPKM(sample2)]] + elif 
pp.comparison_sign == "<": + df_moment = df_moment[df_moment[_FPKM(sample1)] < df_moment[_FPKM(sample2)]] + comp=[comparison] + else: + raise Exception("Comparison not found") + selected = fc_select(df_moment,comp,comparison) + pp.df = selected.copy() + pp.comparison=comp print("\nNumber of ", pp.what," selected: ", len(pp.df)) def show(self, pp): + """Show genes/isoforms as Dataframe """ print(pp.df) - - def _export(self, thing, export, name=None): # TO DO - Not working. To activate - """Manage dataframe or image export parameter.""" - if export is False: - return - elif export is True: - _make_folder(self.path) - thing.to_excel(str(self.path + name + '.xls'), sheet_name='Sheet1') - print("\nExported as " + name + ".xls\n") - else: - raise Exception("export= can be only 'False' or 'True'") - def swap_gene_isoform(self): + def compare(self, pp, other, return_="common"): #to do, show list gene, return list + """Compare two papillon objects""" + A=pp.df.index.tolist() + B=other.df.index.tolist() + common=list(set(A).intersection(B)) + onlyA,onlyB=[],[] + for ele in A: + if ele not in common: + onlyA.append(ele) + for ele in B: + if ele not in common: + onlyB.append(ele) + print("Number of ",pp.what,"in common: ",len(common)) + print("Number of ",pp.what,"in the first Papillion_list: ",len(A),". Not in common with the second:",len(onlyA)) + print("Number of ",other.what,"in the second Papillion_list: ",len(B),". 
Not in common with the second:",len(onlyB)) +# if return_==common: +# pass + + def export(self, pp, name="Table"): + """Export the selected genes/isoforms as excel file.""" + _make_folder(pp.path) + pp.df.to_excel(str(pp.path + name + '.xls'), sheet_name='Sheet1') + print("\nExported as " + name + ".xls\n") + + def search(self): pass #TO DO + class _Plot: """ """ @@ -772,7 +818,6 @@ class _Plot: def _fusion_gene_id(df, what, change_index=False): """Append a "gene/ID" column to the dataframe, and use gene name+id(index) as values, usable or not as index""" - # print(df) if what == "gene": if change_index is True: df.set_index('gene_short_name', inplace=True) @@ -822,6 +867,12 @@ def heatmap(self, pp, z_score=True, col_cluster=False, method="complete", cmap=" **options - all the options accepted by seaborn.clustermap default metric is euclidean. """ + + df_not_sign=pp.df[~pp.df.loc[:, pp.comparison].any(axis=1)] + + + if len(df_not_sign) != 0: + raise Exception("Not significant genes detected. Please exlude them or use another plot.") if len(pp.samples) > 10: raise Exception("High-dimensional data. 
Ronan et al., 2016") print("Number of genes", len(pp.df)) @@ -838,12 +889,12 @@ def heatmap(self, pp, z_score=True, col_cluster=False, method="complete", cmap=" z_score = None small = sns.clustermap( df_heatmap, col_cluster=col_cluster, method=method, cmap=cmap, z_score=z_score, **options) - self._export(small, pp.path, name="small-heatmap", export=export) + self.export(small, pp.path, name="small-heatmap", export=export) if len(df_heatmap) < 1000 and len(df_heatmap) > 25: big = sns.clustermap( df_heatmap, col_cluster=col_cluster, method=method, cmap=cmap, z_score=z_score, figsize=((len(pp.samples)), int(len(df_heatmap.index) / 4)), **options) - self._export(big, pp.path, name="big-heatmap", export=export) + self.export(big, pp.path, name="big-heatmap", export=export) elif len(df_heatmap) > 1000: print("Too many genes for a big heatmap") @@ -861,8 +912,8 @@ def _z_score(df): df = df.sub(df_mean, axis="index") df = df.div(df_std, axis="index") return df - - def lineplot(self, pp, title="", legend=True, z_score=False, export=False, df=None, size=10, ci=None, **option): + + def lineplot(self, pp, title, legend, z_score, export, size, ci, **option): """ LinePlot selected genes expression levels. Max number of genes 200 @@ -873,43 +924,49 @@ def lineplot(self, pp, title="", legend=True, z_score=False, export=False, df=No df - accept an exernal dataframe, different from self.selected **options - all the options accepted by seaborn.factorplot""" - if df is None: - df = pp.df.copy() - samples=pp.samples - - if z_score is True: - df_ = self.onlyFPKM(df, samples, return_as="df", remove_FPKM_name=True) - df_norm = self._z_score(df_) - df_norm["gene_short_name"] = df["gene_short_name"] - df_ = df_norm.copy() - elif z_score is False: - df_ = self.onlyFPKM(df, samples, return_as="gene name", remove_FPKM_name=True) - - print("Number of genes to plot: ", len(df_)) - if len(df_) > 50 and len(df_) < 200: - print("Too many genes. 
Legend not shown") - legend = False - elif len(df_) >= 200: - print("Too many genes. Plot not shown") - return - - if pp.what == "gene": - hue = "gene_short_name" - df_ = self._fusion_gene_id( - df_, pp.what, change_index=False) - elif pp.what == "isoform": - hue = "gene/ID" # Change this hue name - df_ = self._fusion_gene_id( - df_, pp.what, change_index=True) - df_ = df_.reset_index() - df = pd.melt(df_, id_vars=[hue], var_name="Sample", value_name="FPKM") - g = sns.factorplot(x="Sample", y="FPKM", hue=hue, - data=df, ci=ci, size=size, legend=legend, **option) - g.fig.suptitle(title) - self._export(g, pp.path, export=export, name="Plot") - return g + def subplot(df, pp, title="", legend=True, z_score=False, export=False, size=10, ci=None, **option): + if z_score is True: + df_ = self.onlyFPKM(df, pp.samples, return_as="df", remove_FPKM_name=True) + df_norm = self._z_score(df_) + df_norm["gene_short_name"] = df["gene_short_name"] + df_ = df_norm.copy() + elif z_score is False: + df_ = self.onlyFPKM(df, pp.samples, return_as="gene name", remove_FPKM_name=True) + + print("Number of genes to plot: ", len(df_)) + if len(df_) > 50 and len(df_) < 200: + print("Too many genes. Legend not shown") + legend = False + elif len(df_) >= 200: + print("Too many genes. 
Plot not shown") + return + + if pp.what == "gene": + hue = "gene_short_name" + df_ = self._fusion_gene_id( + df_, pp.what, change_index=False) + elif pp.what == "isoform": + hue = "gene/ID" # Change this hue name + df_ = self._fusion_gene_id( + df_, pp.what, change_index=True) + df_ = df_.reset_index() + df = pd.melt(df_, id_vars=[hue], var_name="Sample", value_name="FPKM") + g = sns.factorplot(x="Sample", y="FPKM", hue=hue, + data=df, ci=ci, size=size, legend=legend, **option) + g.fig.suptitle(title) + self.export(g, pp.path, export=export, name="Plot") + return g + + df_sign=pp.df[pp.df.loc[:, pp.comparison].any(axis=1)] + df_not_sign=pp.df[~pp.df.loc[:, pp.comparison].any(axis=1)] + + if len(df_sign) != 0: + subplot(df_sign, pp, title=title+" Significant in AT LEAST one condition", legend=legend, z_score=z_score, export=export, size=size, ci=ci, **option) + if len(df_not_sign) != 0: + subplot(df_not_sign, pp, title="Not significant", legend=legend, z_score=z_score, export=export, size=size, ci=ci, linestyles=len(df_not_sign)*["--"], file_name="Plot not significant", **option) + - def _export(self, thing, path, export, name=None, image_extension=".png"): # add .pdf? + def export(self, thing, path, export, name=None, image_extension=".png"): # add .pdf? 
"""Manage dataframe or image export parameter.""" if export is False: return @@ -918,4 +975,4 @@ def _export(self, thing, path, export, name=None, image_extension=".png"): # ad thing.savefig(str(path + name + image_extension)) print("\nExported as " + name + image_extension) else: - raise Exception("export= can be only 'False' or 'True'") + raise Exception("export= can be only 'False' or 'True'") \ No newline at end of file diff --git a/test/Test_files/isoform_exp.diff b/test/Test_files/isoform_exp.diff index 42ae958..a16b5a5 100644 --- a/test/Test_files/isoform_exp.diff +++ b/test/Test_files/isoform_exp.diff @@ -1,5 +1,5 @@ test_id gene_id gene locus sample_1 sample_2 status value_1 value_2 log2(fold_change) test_stat p_value q_value significant -NM_000600.3 IL6 IL6 chr7:22727146-22732002 Sample 1 Sample 2 NOTEST 0 2 0 0 1 0.01 yes +NM_000600.3 IL6 IL6 chr7:22727146-22732002 Sample 1 Sample 2 NOTEST 0 2 1 0 1 0.01 yes XM_005249745.2 IL6 IL6 chr7:22727146-22732002 Sample 1 Sample 2 NOTEST 0 0 0 0 1 1 no NM_000600.3 IL6 IL6 chr7:22727146-22732002 Sample 1 Sample 3 NOTEST 0 0.029063 inf 0 1 1 no XM_005249745.2 IL6 IL6 chr7:22727146-22732002 Sample 1 Sample 3 NOTEST 0 0 0 0 1 1 no @@ -29,8 +29,8 @@ rna22716 IL15 IL15 chr4:141636595-141733987 Sample 2 Sample 4 NOTEST 0.0190054 0 NM_000585.4 IL15 IL15 chr4:141636595-141733987 Sample 3 Sample 4 NOTEST 0.0433909 0.0356508 -0.283458 0 1 1 no NM_172175.2 IL15 IL15 chr4:141636595-141733987 Sample 3 Sample 4 NOTEST 0.0154687 5.18235E-06 -11.5435 0 1 1 no rna22716 IL15 IL15 chr4:141636595-141733987 Sample 3 Sample 4 NOTEST 0 0.0136664 inf 0 1 1 no -NM_000610.3 CD44 CD44 chr11:35136345-35232402 Sample 1 Sample 2 NOTEST 0 3 0 0 1 0.01 yes -NM_001001389.1 CD44 CD44 chr11:35136345-35232402 Sample 1 Sample 2 NOTEST 4 0 0 0 1 0.02 yes +NM_000610.3 CD44 CD44 chr11:35136345-35232402 Sample 1 Sample 2 NOTEST 0 3 1.5 0 1 0.01 yes +NM_001001389.1 CD44 CD44 chr11:35136345-35232402 Sample 1 Sample 2 NOTEST 4 0 -2 0 1 0.02 yes 
NM_001001390.1 CD44 CD44 chr11:35136345-35232402 Sample 1 Sample 2 NOTEST 0 0 0 0 1 1 no NM_001001391.1 CD44 CD44 chr11:35136345-35232402 Sample 1 Sample 2 NOTEST 1.37211E-06 1.4091E-06 0.0383775 0 1 1 no NM_001001392.1 CD44 CD44 chr11:35136345-35232402 Sample 1 Sample 2 NOTEST 0.025306 0.0425433 0.749454 0 1 1 no @@ -320,6 +320,6 @@ NM_032965.4-2 CCL15-2 CCL15 chr17_KI270857v1_alt:218219-236612 Sample 2 Sample 3 NM_032965.4 CCL15 CCL15 chr17:35983655-36002038 Sample 1 Sample 4 NOTEST 0 0 0 0 1 1 no NM_032965.4-2 CCL15-2 CCL15 chr17_KI270857v1_alt:218219-236612 Sample 1 Sample 4 NOTEST 0 0 0 0 1 1 no NM_032965.4 CCL15 CCL15 chr17:35983655-36002038 Sample 2 Sample 4 NOTEST 0 0 0 0 1 1 no -NM_032965.4-2 CCL15-2 CCL15 chr17_KI270857v1_alt:218219-236612 Sample 2 Sample 4 NOTEST 0 3 0 0 1 0.01 yes -NM_032965.4 CCL15 CCL15 chr17:35983655-36002038 Sample 3 Sample 4 NOTEST 2 0 0 0 1 0.01 yes -NM_032965.4-2 CCL15-2 CCL15 chr17_KI270857v1_alt:218219-236612 Sample 3 Sample 4 NOTEST 0 3 0 0 1 0.01 yes +NM_032965.4-2 CCL15-2 CCL15 chr17_KI270857v1_alt:218219-236612 Sample 2 Sample 4 NOTEST 0 3 1.5 0 1 0.01 yes +NM_032965.4 CCL15 CCL15 chr17:35983655-36002038 Sample 3 Sample 4 NOTEST 2 0 -1 0 1 0.01 yes +NM_032965.4-2 CCL15-2 CCL15 chr17_KI270857v1_alt:218219-236612 Sample 3 Sample 4 NOTEST 0 3 1.5 0 1 0.01 yes diff --git a/test/test_papillon.py b/test/test_papillon.py index aa99f19..88edd88 100644 --- a/test/test_papillon.py +++ b/test/test_papillon.py @@ -36,7 +36,7 @@ def test_different_read(self): pp.read_folder(path) pp.read_folder(path+"/galaxy") pp.read_files([path+"/gene_exp.diff",path+"/genes.fpkm_tracking",path+"/isoform_exp.diff",path+"/isoforms.fpkm_tracking"]) - + def test_functions_FPKM(self): self.assertEqual(pp._FPKM("ciao"),"ciao_FPKM") self.assertEqual(pp._FPKM("ciao_FPKM"),"ciao") @@ -53,7 +53,7 @@ def test_functions__obtain_list(self): self.assertEqual(len(pp._obtain_list("test49.list",test.path)),49) self.assertEqual(pp._obtain_list(["ciao","hello"],"fake 
path"),["ciao","hello"]) - def test_read_folder(self): + def test_papillon_db(self): test=pp.read_folder(path) samples_test=['Sample 1', 'Sample 2', 'Sample 3', 'Sample 4'] self.assertTrue(test.samples==samples_test) @@ -61,131 +61,349 @@ def test_read_folder(self): 'Sample 1_vs_Sample 4', 'Sample 2_vs_Sample 3', 'Sample 2_vs_Sample 4', 'Sample 3_vs_Sample 4'] self.assertTrue(test.comparisons==comparison_test) - self.assertEqual(len(test.genes_detect),5) - self.assertEqual(len(test.genes_significant),3) - self.assertEqual(len(test.isoforms_detect),28) - self.assertEqual(len(test.isoforms_significant),5) - a=len(test.genes_detect.columns) - b=len(test.genes_significant.columns) - c=len(test.isoforms_detect.columns) - d=len(test.isoforms_significant.columns) - self.assertTrue(a==b and b==c and c==d and d==18) + self.assertEqual(len(test.genes_detected),5) + self.assertEqual(len(test.isoforms_detected),28) + a=len(test.genes_detected.columns) + c=len(test.isoforms_detected.columns) + self.assertTrue(a==c and c==24) print_test=pp.read_folder(path) printable="Samples: ['Sample 1', 'Sample 2', 'Sample 3', 'Sample 4']\nComparison: ['Sample 1_vs_Sample 2', 'Sample 1_vs_Sample 3', 'Sample 1_vs_Sample 4', 'Sample 2_vs_Sample 3', 'Sample 2_vs_Sample 4', 'Sample 3_vs_Sample 4']\nGenes Detected: 5\nGenes differential expressed: 3\nIsoform Detected: 28\nIsoform differential expressed: 5\n" print(print_test.__str__(),"\n",printable) self.assertTrue(print_test.__str__()==printable) del print_test + def test_significant(self): + test=pp.read_folder(path) + self.assertEqual(len(test.Manipulate.significant(test,"gene")),3) + self.assertEqual(len(test.Manipulate.significant(test,"isoform")),5) + b=len(test.Manipulate.significant(test,"gene").columns) + d=len(test.Manipulate.significant(test,"isoform").columns) + self.assertTrue(b==d and d==24) + def test_get_gene(self): sub=test.get_gene() self.assertEqual(len(sub.df),3) - self.assertEqual(len(sub.df.columns),18) + 
self.assertEqual(len(sub.df.columns),24) + # genelist sub=test.get_gene("IL17RC") self.assertEqual(sub.df.index[0],"MSTRG.10454") self.assertEqual(len(sub.df),1) - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) sub=test.get_gene(["IL6","CCL15"]) self.assertEqual(len(sub.df),2) self.assertEqual(sub.df.index[0],"IL6") self.assertEqual(sub.df.index[1],"CCL15-2") - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_gene(["IL17RC","CCL15","CD44"]) + self.assertEqual(len(sub.df),2) + self.assertEqual(sub.df.index[0],"MSTRG.10454") + self.assertEqual(sub.df.index[-1],"CCL15-2") + self.assertEqual(len(sub.df.columns),24) + + # Gene-Comparison Test + sub=test.get_gene(comparison=None) + self.assertEqual(len(sub.df),5) + self.assertEqual(sub.df.index[0],"IL6") + self.assertEqual(sub.df.index[-1],"CCL15-2") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_gene(comparison="Sample 2_vs_Sample 4") + self.assertEqual(len(sub.df),1) + self.assertEqual(sub.df.index[0],"MSTRG.10454") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_gene(comparison="Sample 1_vs_Sample 2", comparison_sign=">") + self.assertEqual(len(sub.df),0) + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_gene(comparison="Sample 1_vs_Sample 2", comparison_sign="<") + self.assertEqual(len(sub.df),1) + self.assertEqual(sub.df.index[0],"IL6") + self.assertEqual(len(sub.df.columns),24) + + # Fold test + sub=test.get_gene(fold_sign="<") # ? 
+ self.assertEqual(len(sub.df),3) + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_gene(fold_ind=1) + self.assertEqual(len(sub.df),2) + self.assertEqual(sub.df.index[1],"IL6") + self.assertEqual(sub.df.index[0],"CCL15-2") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_gene(fold_ind=0.5) + self.assertEqual(len(sub.df),3) + self.assertEqual(sub.df.index[0],"CCL15-2") + self.assertEqual(sub.df.index[-1],"MSTRG.10454") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_gene(fold_ind=1,fold_sign="<") + self.assertEqual(len(sub.df),1) + self.assertEqual(sub.df.index[0],"MSTRG.10454") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_gene(fold_ind="1",fold_sign="<") + self.assertEqual(len(sub.df),1) + self.assertEqual(sub.df.index[0],"MSTRG.10454") + self.assertEqual(len(sub.df.columns),24) - # Gene-Comparison Test + # Combinations sub=test.get_gene(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2") self.assertEqual(len(sub.df),1) self.assertEqual(sub.df.index[0],"IL6") - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) sub=test.get_gene(["IL6","CCL15"], comparison="Sample 3_vs_Sample 4") self.assertEqual(len(sub.df),1) self.assertEqual(sub.df.index[0],"CCL15-2") - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) sub=test.get_gene(["IL6","CCL15"], comparison="Sample 2_vs_Sample 3") self.assertEqual(len(sub.df),0) - self.assertEqual(len(sub.df.columns),18) - - # Gene-Comparison-sign Test + self.assertEqual(len(sub.df.columns),24) + sub=test.get_gene(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", comparison_sign=">") self.assertEqual(len(sub.df),0) - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) sub=test.get_gene(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", comparison_sign="<") self.assertEqual(len(sub.df),1) self.assertEqual(sub.df.index[0],"IL6") - self.assertEqual(len(sub.df.columns),18) + 
self.assertEqual(len(sub.df.columns),24) + + sub=test.get_gene(["IL17RC","CCL15","CD44"], comparison=None) + self.assertEqual(len(sub.df),3) + self.assertEqual(sub.df.index[0],"CD44") + self.assertEqual(sub.df.index[-1],"CCL15-2") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_gene(["IL6","CD44"], fold_ind="1") + self.assertEqual(len(sub.df),1) + self.assertEqual(sub.df.index[0],"IL6") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_gene(["IL17RC","CCL15"], fold_ind="1", fold_sign="<") + self.assertEqual(len(sub.df),1) + self.assertEqual(sub.df.index[0],"MSTRG.10454") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_gene(["CD44","IL6"], comparison=None, fold_ind="1") + self.assertEqual(len(sub.df),2) + self.assertEqual(sub.df.index[1],"IL6") + self.assertEqual(sub.df.index[0],"CD44") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_gene(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", fold_ind="1") + self.assertEqual(len(sub.df),1) + self.assertEqual(sub.df.index[0],"IL6") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_gene(["IL6","CCL15","CD44"], comparison=None, fold_ind=3.29, fold_sign=">") + self.assertEqual(len(sub.df),2) + self.assertEqual(sub.df.index[1],"IL6") + self.assertEqual(sub.df.index[0],"CCL15-2") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_gene(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", comparison_sign="<", fold_ind=1, fold_sign=">") + self.assertEqual(len(sub.df),1) + self.assertEqual(sub.df.index[0],"IL6") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_gene(["IL6","CCL15","IL17RC"], comparison="Sample 2_vs_Sample 4", comparison_sign=">", fold_ind=0.5, fold_sign=">") + self.assertEqual(len(sub.df),1) + self.assertEqual(sub.df.index[0],"MSTRG.10454") + self.assertEqual(len(sub.df.columns),24) + sub=test.get_gene(["IL6","CCL15","IL17RC"], comparison="Sample 2_vs_Sample 4", comparison_sign=">", fold_ind=1, fold_sign="<") + 
self.assertEqual(len(sub.df),1) + self.assertEqual(sub.df.index[0],"MSTRG.10454") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_gene(["CD44","CCL15"], comparison="Sample 1_vs_Sample 2", comparison_sign="<", fold_ind=1, fold_sign="<") + self.assertEqual(len(sub.df),0) + self.assertEqual(len(sub.df.columns),24) + sub=test.get_gene() self.assertEqual(len(sub.df),3) - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) with self.assertRaises(Exception): sub=test.get_gene(comparison="Sample 1_vs_Sample 2", comparison_sign="Wrong") with self.assertRaises(Exception): - sub=test.get_gene(sign=">") + sub=test.get_gene(comparison_sign=">") with self.assertRaises(Exception): sub=test.get_gene(comparison="Wrong") + with self.assertRaises(Exception): + sub=test.get_gene(fold_sign="Wrong") + with self.assertRaises(Exception): + sub=test.get_gene(fold_ind=1, fold_sign="Wrong") + with self.assertRaises(Exception): + sub=test.get_gene(fold_ind="Wrong") + with self.assertRaises(Exception): + sub=test.get_gene(fold_ind=-1) def test_get_isoform(self): + + # Final sub=test.get_isoform() self.assertEqual(len(sub.df),5) - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_isoform() + self.assertEqual(len(sub.df),5) + self.assertEqual(len(sub.df.columns),24) + # genelist sub=test.get_isoform("IL6") self.assertEqual(sub.df.index[0],"NM_000600.3") self.assertEqual(len(sub.df),1) - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) sub=test.get_isoform("CCL15") self.assertEqual(sub.df.index[0],"NM_032965.4") self.assertEqual(sub.df.index[1],"NM_032965.4-2") self.assertEqual(len(sub.df),2) - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) sub=test.get_isoform(["IL6","CD44"]) self.assertEqual(len(sub.df),3) - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) sub=test.get_isoform(["IL6","CCL15"]) 
self.assertEqual(len(sub.df),3) - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) + + # Gene-Comparison Test + sub=test.get_isoform(comparison=None) + self.assertEqual(len(sub.df),28) + self.assertEqual(sub.df.index[0],"NM_000600.3") + self.assertEqual(sub.df.index[-1],"NM_032965.4-2") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_isoform(comparison="Sample 2_vs_Sample 4") + self.assertEqual(len(sub.df),1) + self.assertEqual(sub.df.index[0],"NM_032965.4-2") + self.assertEqual(len(sub.df.columns),24) # Isoform-Comparison Test sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2") self.assertEqual(len(sub.df),1) self.assertEqual(sub.df.index[0],"NM_000600.3") - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 3_vs_Sample 4") self.assertEqual(len(sub.df),2) self.assertEqual(sub.df.index[0],"NM_032965.4") - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 2_vs_Sample 4") self.assertEqual(len(sub.df),1) self.assertEqual(sub.df.index[0],"NM_032965.4-2") - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 2_vs_Sample 3") self.assertEqual(len(sub.df),0) - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) # Isoform-Comparison-sign Test sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", comparison_sign=">") self.assertEqual(len(sub.df),0) - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", comparison_sign="<") self.assertEqual(len(sub.df),1) self.assertEqual(sub.df.index[0],"NM_000600.3") - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) + + # 
Fold test + sub=test.get_isoform(fold_sign="<") # ? So far is ignored. should it return error? + self.assertEqual(len(sub.df),5) + self.assertEqual(len(sub.df.columns),24) - # Final + sub=test.get_isoform(fold_ind=1) + self.assertEqual(len(sub.df),5) + self.assertEqual(sub.df.index[0],"NM_000600.3") + self.assertEqual(sub.df.index[-1],"NM_032965.4-2") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_isoform(fold_ind=1.1) + self.assertEqual(len(sub.df),3) + self.assertEqual(sub.df.index[0],"NM_000610.3") + self.assertEqual(sub.df.index[-1],"NM_032965.4-2") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_isoform(fold_ind=1,fold_sign="<") + self.assertEqual(len(sub.df),2) + self.assertEqual(sub.df.index[0],"NM_000600.3") + self.assertEqual(sub.df.index[1],"NM_032965.4") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_isoform(fold_ind="1",fold_sign="<") + self.assertEqual(len(sub.df),2) + self.assertEqual(sub.df.index[0],"NM_000600.3") + self.assertEqual(sub.df.index[1],"NM_032965.4") + self.assertEqual(len(sub.df.columns),24) + + # Combinations + sub=test.get_isoform(["IL17RC","CCL15","CD44"], comparison=None) + self.assertEqual(len(sub.df),24) + self.assertEqual(sub.df.index[0],"NM_000610.3") + self.assertEqual(sub.df.index[-1],"NM_032965.4-2") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_isoform(["CD44","IL6"], comparison=None, fold_ind="1") + self.assertEqual(len(sub.df),8) + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_isoform(["CD44","IL6"], comparison=None, fold_ind="1.6") + self.assertEqual(len(sub.df),7) + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_isoform(["CD44","IL6"], comparison=None, fold_ind="2.1") + self.assertEqual(len(sub.df),6) + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_isoform(["IL6","CCL15"], comparison="Sample 1_vs_Sample 2", comparison_sign="<", fold_ind=1, fold_sign=">") + self.assertEqual(len(sub.df),1) + 
self.assertEqual(sub.df.index[0],"NM_000600.3") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_isoform(["IL6","CCL15","IL17RC"], comparison="Sample 2_vs_Sample 4", comparison_sign="<", fold_ind=1.5, fold_sign=">") + self.assertEqual(len(sub.df),1) + self.assertEqual(sub.df.index[0],"NM_032965.4-2") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_isoform(["IL6","CCL15","IL17RC"], comparison="Sample 2_vs_Sample 4", comparison_sign="<", fold_ind=1.6, fold_sign="<") + self.assertEqual(len(sub.df),1) + self.assertEqual(sub.df.index[0],"NM_032965.4-2") + self.assertEqual(len(sub.df.columns),24) + + sub=test.get_isoform(["CD44","CCL15"], comparison="Sample 1_vs_Sample 2", comparison_sign="<", fold_ind=1, fold_sign="<") + self.assertEqual(len(sub.df),0) + self.assertEqual(len(sub.df.columns),24) + sub=test.get_isoform() self.assertEqual(len(sub.df),5) - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) + + with self.assertRaises(Exception): + sub=test.get_isoform(comparison="Sample 1_vs_Sample 2", comparison_sign="Wrong") + with self.assertRaises(Exception): + sub=test.get_isoform(comparison_sign=">") + with self.assertRaises(Exception): + sub=test.get_isoform(comparison="Wrong") + with self.assertRaises(Exception): + sub=test.get_isoform(fold_sign="Wrong") + with self.assertRaises(Exception): + sub=test.get_isoform(fold_ind=1, fold_sign="Wrong") + with self.assertRaises(Exception): + sub=test.get_isoform(fold_ind="Wrong") + with self.assertRaises(Exception): + sub=test.get_isoform(fold_ind=-1) def test_onlyFPKM(self): test=pp.read_folder(path) @@ -265,7 +483,7 @@ def test_onlyFPKM(self): # Final sub=test.get_isoform() self.assertEqual(len(sub.df),5) - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) def test_z_score(self): sub=test.get_isoform() @@ -293,40 +511,40 @@ def test_search(self): search_result=test.search(word="IL6",where="genes_detected", how="table") 
self.assertTrue(type(search_result)==pd.DataFrame) self.assertEqual(len(search_result),1) - self.assertEqual(len(search_result.columns),18) + self.assertEqual(len(search_result.columns),24) self.assertEqual(search_result.index[0],"IL6") search_result=test.search(word="CD44",where="isoforms_detected", how="table") self.assertTrue(type(search_result)==pd.DataFrame) self.assertEqual(len(search_result),7) - self.assertEqual(len(search_result.columns),18) + self.assertEqual(len(search_result.columns),24) self.assertEqual(search_result.index[0],"NM_000610.3") self.assertEqual(search_result.index[-1],"XM_006718390.1") search_result=test.search(word="IL6",where="genes_significant", how="table") self.assertTrue(type(search_result)==pd.DataFrame) self.assertEqual(len(search_result),1) - self.assertEqual(len(search_result.columns),18) + self.assertEqual(len(search_result.columns),24) self.assertEqual(search_result.index[0],"IL6") # search_result=test.search(word="CCL15",where="genes_significant", how="selected") # self.assertEqual(len(test.df),1) -# self.assertEqual(len(test.df.columns),18) +# self.assertEqual(len(test.df.columns),24) # self.assertEqual(test.df.index[0],"CCL15-2") # search_result=test.search(word="CD44",where="isoforms_significant", how="selected") # self.assertEqual(len(test.df),2) -# self.assertEqual(len(test.df.columns),18) +# self.assertEqual(len(test.df.columns),24) # self.assertEqual(test.df.index[0],"NM_000610.3") # self.assertEqual(test.df.index[-1],"NM_001001389.1") sub=test.get_isoform() self.assertEqual(len(sub.df),5) - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) sub=test.get_gene() self.assertEqual(len(sub.df),3) - self.assertEqual(len(sub.df.columns),18) + self.assertEqual(len(sub.df.columns),24) def test_fusion_gene_id(self): sub=test.get_isoform() @@ -335,7 +553,7 @@ def test_fusion_gene_id(self): df2=sub.plot._fusion_gene_id(df, sub.what, change_index=False) self.assertEqual(len(df2["gene/ID"]),5) - 
self.assertEqual(len(df2.columns),19) + self.assertEqual(len(df2.columns),25) self.assertEqual(df2["gene/ID"][0],"IL6 NM_000600.3") self.assertEqual(df2["gene/ID"][-1],"CCL15 NM_032965.4-2") self.assertEqual(df2.index[0],"NM_000600.3") @@ -345,7 +563,7 @@ def test_fusion_gene_id(self): df2=sub.plot._fusion_gene_id(df, sub.what, change_index=True) self.assertEqual(len(df2),5) - self.assertEqual(len(df2.columns),17) + self.assertEqual(len(df2.columns),23) self.assertEqual(df2.index[0],"IL6 NM_000600.3") self.assertEqual(df2.index[-1],"CCL15 NM_032965.4-2") @@ -353,13 +571,13 @@ def test_fusion_gene_id(self): df=sub.df.copy() df2=sub.plot._fusion_gene_id(df, sub.what, change_index=False) self.assertEqual(len(df2),3) - self.assertEqual(len(df2.columns),18) + self.assertEqual(len(df2.columns),24) self.assertEqual(df2.index[1],"MSTRG.10454") self.assertEqual(df2.index[-1],"CCL15-2") df2=sub.plot._fusion_gene_id(df, sub.what, change_index=True) self.assertEqual(len(df2),3) - self.assertEqual(len(df2.columns),17) + self.assertEqual(len(df2.columns),23) self.assertEqual(df2.index[0],"IL6") self.assertEqual(df2.index[-1],"CCL15") @@ -377,8 +595,11 @@ def drop(comp): test2=pp.read_folder(path,drop_comparison=comp) test3=pp.read_folder(path) test3.drop_comparison(comp) - df1=test2.genes_significant.all() - df2=test3.genes_significant.all() + df1=test2.genes_detected.all() + df2=test3.genes_detected.all() + self.assertTrue(df1.all()==df2.all()) + df1=test2.isoforms_detected.all() + df2=test3.isoforms_detected.all() self.assertTrue(df1.all()==df2.all()) def multidrop(comp): @@ -387,8 +608,11 @@ def multidrop(comp): test2.drop_comparison(comp) for c in comp: test3.drop_comparison(c) - df1=test2.genes_significant.all() - df2=test3.genes_significant.all() + df1=test2.genes_detected.all() + df2=test3.genes_detected.all() + self.assertTrue(df1.all()==df2.all()) + df1=test2.isoforms_detected.all() + df2=test3.isoforms_detected.all() self.assertTrue(df1.all()==df2.all()) drop("Sample 
1_vs_Sample 2") @@ -416,15 +640,15 @@ def test_change_samples_order(self): self.assertTrue(test.samples==samples_test) comparison_test=['Sample 1_vs_Sample 2', 'Sample 1_vs_Sample 3', 'Sample 1_vs_Sample 4', 'Sample 2_vs_Sample 3', 'Sample 2_vs_Sample 4', 'Sample 3_vs_Sample 4'] self.assertTrue(test.comparisons==comparison_test) - self.assertEqual(len(test.genes_detect),5) - self.assertEqual(len(test.genes_significant),3) - self.assertEqual(len(test.isoforms_detect),28) - self.assertEqual(len(test.isoforms_significant),5) - a=len(test.genes_detect.columns) - b=len(test.genes_significant.columns) - c=len(test.isoforms_detect.columns) - d=len(test.isoforms_significant.columns) - self.assertTrue(a==b and b==c and c==d and d==18) + self.assertEqual(len(test.genes_detected),5) + self.assertEqual(len(test.Manipulate.significant(test,"gene")),3) + self.assertEqual(len(test.isoforms_detected),28) + self.assertEqual(len(test.Manipulate.significant(test,"isoform")),5) + a=len(test.genes_detected.columns) + b=len(test.Manipulate.significant(test,"gene").columns) + c=len(test.isoforms_detected.columns) + d=len(test.Manipulate.significant(test,"isoform").columns) + self.assertTrue(a==b and b==c and c==d and d==24) with self.assertRaises(Exception): test.change_order(["Sample 4","Sample 3","Sample 2"]) @@ -458,11 +682,10 @@ def plot_maker(type_sel,z_score): df_ = sub.plot._fusion_gene_id(df_, type_sel, change_index=True) df_ = df_.reset_index() -# df_ = test.plot._fusion_gene_id(df_, type_sel, change_index=False) - df = pd.melt(df_, id_vars=hue, var_name="Sample", value_name="FPKM") g = sns.factorplot(x="Sample", y="FPKM", hue=hue, data=df, ci=None, legend=True, size=10) + g.fig.suptitle(" Significant in AT LEAST one condition") g.savefig(str(test.path + "test_plot.png")) def image_check(): From fc257a56c9c58b1350523fb43c0ff266ae4f6c86 Mon Sep 17 00:00:00 2001 From: domenico-somma <34346930+domenico-somma@users.noreply.github.com> Date: Sun, 23 Sep 2018 13:28:24 +0100 Subject: 
[PATCH 4/8] Papillon 0.2 Now seaborn 0.23 required Now you can keep the genes/isoforms subselection in a variable Add or compare two subselections You can select either gene/isoform significant expressed for at least one condition or not significant at all. Plot gene/isoform significant for at least one condition or not significant with continuous and dashed line. self.read_db() deprecated self.plot() deprecated Add check if papillon folder is removed Improved code (Single Responsibility Principle) --- CHANGE.txt | 1 + TO DO.txt | 5 +- docs/manual.rst | 495 +++++++++--------- docs/papillon.html | 325 ++++++++++++ docs/pydoc papillon.html | 233 --------- papillon.py | 335 +++++++++--- .../Papillon/Table gene All them.xls | Bin 0 -> 10240 bytes .../Papillon/Table isoform all them.xls | Bin 0 -> 20992 bytes test/test_papillon.py | 105 +++- 9 files changed, 937 insertions(+), 562 deletions(-) create mode 100644 docs/papillon.html delete mode 100644 docs/pydoc papillon.html create mode 100644 test/Test_files/Papillon/Table gene All them.xls create mode 100644 test/Test_files/Papillon/Table isoform all them.xls diff --git a/CHANGE.txt b/CHANGE.txt index d6fd82a..3f93eae 100644 --- a/CHANGE.txt +++ b/CHANGE.txt @@ -1,4 +1,5 @@ v 0.2.0, -18 -- Major changes: +Now seaborn 0.23 required Now you can keep the genes/isoforms subselection in a variable Add or compare two subselections You can select either gene/isoform significant expressed for at least one condition or not significant at all. diff --git a/TO DO.txt b/TO DO.txt index 5ea5638..ff8514d 100644 --- a/TO DO.txt +++ b/TO DO.txt @@ -1,5 +1,5 @@ * Add Confidence Interval to plot. 
-* log10 transformation FPKM values +* log2 transformation FPKM values * Add search isoforms ID option * Add barplot * plot genes detected/significant as different lines @@ -15,3 +15,6 @@ * show replicates on lineplot * Box plot of FPKM distributions for individual conditions #see cummeRbund * Density plot of individual conditions #see cummeRbund + +* Add DESEQ2 support +* Add kallisto/sleuth support diff --git a/docs/manual.rst b/docs/manual.rst index 1fd45a4..eaddbed 100644 --- a/docs/manual.rst +++ b/docs/manual.rst @@ -1,451 +1,440 @@ -Papillon is a python alternative to cummeRbund to read and plot cuffdiff/Galaxy -RNA-seq data. - -To start +Papillon ======== -After RNA-seq analysis with `Galaxy `__, +A python version of CummeRbund to read and plot Galaxy/cuffdiff RNA-seq +data + +Before To start +--------------- + +After RNA-seq analysis with Galaxy, + download the 4 files generated by cuffdiff containing respectively: -*... transcript\_FPKM\_tracking...* +... transcript\_FPKM\_tracking... -*... gene\_FPKM\_tracking ...* +... gene\_FPKM\_tracking ... -*... gene\_differential\_expression...* +... gene\_differential\_expression... -*... transcript\_differential\_expression...* +... transcript\_differential\_expression... And put them in the same folder. You can either use directly the files or change the names according to + cummeRbund in: -*... transcript\_FPKM\_tracking...* = isoforms.fpkm\_tracking +... transcript\\\_FPKM\\\_tracking... = isoforms.fpkm\_tracking -*... gene\_FPKM\_tracking ...* = gene.fpkm\_tracking +... gene\\\_FPKM\\\_tracking ... = gene.fpkm\_tracking -*... gene\_differential\_expression...* = gene\_exp.diff +... gene\\\_differential\\\_expression... = gene\_exp.diff -*... transcript\_differential\_expression...* = isoform\_exp.diff +... transcript\\\_differential\\\_expression... = isoform\_exp.diff +Functions +--------- -Commands -======== +read\_db(path, drop\_comparison=None) + + Deprecated. Use read\_folder() instead. 
+ +read\_files(files, path=None, drop\_comparison=None) + + Accept cuffdiff/cummeRbund files as iterable + + ("transcript\_FPKM\_tracking", + + "gene\_FPKM\_tracking", + + "gene\_differential\_expression", -Functions: ----------- + "transcript\_differential\_expression") -``read_folder(path, drop_comparison=[])`` + and return them to \_papillon\_builder() to create a Papillon\_db +object. -Read the cuffdiff/cummeRbund files. Return a class papillon. + Parameters -**path**: *str* + ---------- -folder path and folder name, containing the files + files - accept an iterable with the cuffdiff files names -**drop\_comparison**: *str or list* + path - where to export Papillon generated files -what comparison do not import + drop\_comparison - drop comparison (str) or list of comparisons to drop +from the cuffdiff table -i.e.: + Example -:: + ---------- - import papillon as pl - MyExp=pl.read_folder("My Experiment", drop_comparison=["Sample 1_vs_Sample 2","Sample 3_vs_Sample 4"]) +pp.read\_files(["Files/gene\_exp.diff","Files/genes.fpkm\_tracking","Files/isoform\_exp.diff","Files/isoforms.fpkm\_tracking"]) --------------- +read\_folder(path, drop\_comparison=None) + Read the folder containing the cuffdiff/cummeRbund files, and return -``read_files(files, path=None, drop_comparison=None)`` + them to \_papillon\_builder() to create a Papillon\_db object. -Accept cuffdiff/cummeRbund files as iterable. Return a class papillon. 
+ Parameters -**files**: *iterable* + ---------- -iterable with cuffdiff files + path - accept a str with the folder path, containing the cuffdiff files -**path**: *str* + drop\_comparison - drop comparison (str) or list of comparisons to drop +from the cuffdiff table -folder path where export Papillon generated files + Example -**drop\_comparison**: *str or iterable* + ---------- -what comparison do not import + MyProject=pp.read\_folder("MyFolder/Test\_files") -i.e.: +Classes +------- -:: +class Papillon\_db(builtins.object) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - import papillon as pl + Create a Papillon\_db object using read\_folder() or read\_files() and +\_papillon\_builder - MyExp=pl.read_files(["file1.tab","file2.tab","file3.tab","file4.tab",],"My Experiment") + Methods defined here: --------------- +\_\_init\_\_(self, path, samples, comparisons, genes\_detected, +isoforms\_detected) + self.path - files path -``read_db`` is deprecated + self.samples - samples found --------------- + self.comparisons - comparisons found + self.genes\_detected - dataframe of genes detected -Class: ------- + self.isoforms\_detected - dataframe of isoforms detected -``papillon`` +\_\_str\_\_(self) -this class store cuffdiff/cummeRbund data. The class has the following -attributes: + Return str(self). 
-``self.path`` - files path +change\_order(self, new\_order) -``self.samples`` - samples found + Change the samples order -``self.comparison`` - comparisons found + Parameters -``self.genes_detect`` - dataframe of genes detected + ---------- -``self.genes_significant`` - dataframe of genes significant expressed -among comparisons + new\_order: list of samples order -``self.isoforms_detect`` - dataframe of isoforms detected + Example -``self.isoforms_significant`` - dataframe of isoforms significant -expressed among comparisons + ---------- -Some papillon class methods create + MyProject.change\_order(["Sample 4","Sample 3","Sample 2","Sample 1"]) -``self.selected`` - contains a list of genes/isoforms selected -(according with ``self.type_selected``) +drop\_comparison(self, comparison) -``self.type_selected`` - could be either equal to "gene" or "isoform". -Define the type of ``self.selected`` entry. + Drop Comparison (str) or list of comparisons + Parameters -Class Methods: --------------- -**self.dropComparison(self, comparison)** + ---------- -Drop Comparison or list of comparisons and re-calculate significant -genes/isoforms + comparison: comparison (str) or list of comparisons -**comparison**: *str* or *list* + Example -i.e.: + ---------- -:: + MyProject.drop\_comparison(specific\_comparison) - import papillon as pl +get\_gene(self, genelist=None, comparison='all', comparison\_sign=None, +fold\_ind=None, fold\_sign='>') - MyExp=pl.read_db("My Experiment") + This method select genes per name or conditions. It return a +Papillon\_list object - MyExp.drop_comparison(["Sample 1_vs_Sample 2","Sample 3_vs_Sample 4"]) + Parameters --------------- + ---------- -**self.change_order(self, new_order)** + genelist - accept string (1 gene name), list of gene names or file with +a list of gene names -Change the samples order + comparison - To select genes higher/lower in one condition compared to +another. 
Accept either "all" to pass all the comparisons, or accept only +1 comparison as str (already present in the data) -**new\_order**: *list* + comparison\_sign - usable in combination with comparison, accept either +">" or "<" -i.e.: + fold\_ind - fold induction (log2) higher/lower then number -:: + fold\_sign - usable in combination with fold\_ind, accept either ">" or +"<" - import papillon as pl + Example - MyExp=pl.read_db("My Experiment") + ---------- - MyExp.change_order(["Sample 4","Sample 3","Sample 2","Sample 1"] + Selection=MyProject.get\_gene(["CD44","CCL15"], comparison="Sample +1\_vs\_Sample 2", comparison\_sign="<", fold\_ind=1, fold\_sign="<") --------------- +get\_isoform(self, genelist=None, comparison='all', +comparison\_sign=None, fold\_ind=None, fold\_sign='>') -**self.get_gene(self, genelist=None, comparison=None, sign=None, export=False)** + This function select isoforms. It creates a Papillon object -This method selects genes. Create ``self.selected`` and -``self.type_selected="gene"`` + Parameters -**genelist**: *None*, *str* or *list* + ---------- -with *None* (default) select all the genes differential expressed in the -select comparison. *str* can be either a gene name or a plain text file -with a list of gene names. *list* accept a list of gene names. + genelist - accept string (1 gene name), list of gene names or file with +a list of gene names -**comparison**: *None* or *str* + comparison - To select genes higher/lower in one condition compared to +another. Accept either "all" to pass all the comparisons, or accept only +1 comparison as str (already present in the data) -with *None* (default) select all the genes differential expressed in at -least one comparison. 
If *str* is passed, it selects only genes -differential expressed in that specific comparison + comparison\_sign - usable in combination with comparison, accept either +">" or "<" -**sign**: *None*, *">"* or *"<"* + fold\_ind - fold induction (log2) higher/lower then number -with *None* (default) select all the genes differential expressed in the -select comparison. If sign is different from *None*, comparison is -required. If *">"* is passed, it selects all genes higher in the first -term compared to the second one. The opposite with *"<"* + fold\_sign - usable in combination with fold\_ind, accept either ">" or +"<" -I.e. + Example -``self.get_gene(self, comparison="Sample 3_vs_Sample 4", sign=">")`` + ---------- -select all the genes where Sample 3 is higher ( > ) than Sample 4 + Selection=MyProject.get\_isoform(["IL6","CCL15","IL17RC"], +comparison="Sample 2\_vs\_Sample 4", comparison\_sign="<") -**export** - True/False +search(self, word, where, how='table', export=False) -False is default. Whether or not export the selected genes as .xls file. + search among genes/isoforms names in detected and significant -i.e.: + Parameters -:: + ---------- - import papillon as pl + word - accept a str to search among the gene names - MyExp=pl.read_db("My Experiment") + where - accept: - MyExp.get_gene() + "genes\_detected" - MyExp.get_gene(genelist="IL6", export=True) + "genes\_significant" - MyExp.get_gene(genelist="my_list.txt", comparison="Sample 3_vs_Sample 4", sign="<") + "isoforms\_detected" --------------- + "isoforms\_significant" -**self.get_isoform(self, genelist=None, comparison=None, sign=None, export=False, show_dup=False)** + how - accept: -This method selects isoform. 
Create ``self.selected`` and -``self.type_selected="isoform"`` + "table" return the dataframe with the genes found -**genelist**: *None*, *str* or *list* + "list" return a list of names, no duplicates -with *None* (default) select all the isoforms differential expressed in -the select comparison. *str* can be either a gene name or a plain text -file with a list of gene names. *list* accept a list of gene names. + export - True/False -**comparison**: *None* or *str* + Example -with *None* (default) select all the isoforms differential expressed in -at least one comparison. If *str* is passed, it selects only isoforms -differential expressed in that specific comparison + ---------- -**sign**: *None*, *">"* or *"<"* + search\_result=MyProject.search(word="IL6",where="genes\_significant", +how="table") -with *None* (default) select all the isoforms differential expressed in -the select comparison. If sign is different from *None*, comparison is -required. If *">"* is passed, it selects all isoforms higher in the -first term compared to the second one. The opposite with *"<"* +class Papillon\_list(builtins.object) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -I.e. + Class containing a selected list of genes, with data associated, +generated by Papillon\_db.get\_gene() or Papillon\_db.get\_isoform() +methods -``self.get_isoform(self, comparison="Sample 3_vs_Sample 4", sign=">")`` +Is possible add the content of two or more Papillon\_list objects. -select all the isoforms where Sample 3 is higher ( > ) than Sample 4 +Example: -**export** - True/False +P\_list3 = P\_list1+P\_list2 -False is default. Whether or not export the selected isoforms as .xls -file. +P\_list4=sum([PL1, PL2, PL3]) -**show\_dup** - True/False + Methods defined here: -False is default. 
Whether or not indicate if there are more then 1 -isoform for each gene name in the exported .xls file (export=True -required) +\_\_add\_\_(self, other) -i.e.: +\_\_init\_\_(self, df, what, comparisons, path, samples, +comparison='all', comparison\_sign=None, fold\_ind=None, fold\_sign='>', +p=0.05) -:: + Initialize self. See help(type(self)) for accurate signature. - import papillon as pl +\_\_radd\_\_(self, other) - MyExp=pl.read_db("My Experiment") +\_\_str\_\_(self) - MyExp.get_isoform() + Return str(self). - MyExp.get_isoform(genelist="IL6", export=True) +compare(self, other) - MyExp.get_isoform(genelist="my_list.txt", comparison="Sample 3_vs_Sample 4", sign="<") + Compare two Papillon\_list objects --------------- + Parameters -**self.onlyFPKM(return_as, **option)** + ---------- -Use self.selected and return a DataFrame with only FPKM values columns. + other: another Papillon\_list object -**return\_as** - *"df"*, *"array"* or *"gene name"* + Example -*"df"* return a pandas DataFrame, *"array"* return a numpy array, *"gene -name"* return a pandas DataFrame containing gene names in addition to -the FPKM values columns + ---------- -\*\*option - accepts extra\_df parameter extra\_df parameter accepts an -external dataframe (it has to be a papillon class format). + Selection1.compare(Selection2) -i.e.: +export(self) -:: + Export the selected genes/isoforms as excel file. - import papillon as pl + Example - MyExp=pl.read_db("My Experiment") + ---------- - MyExp.get_isoform() + Selection.export() - MyExp.onlyFPKM("df") +heatmap(self, z\_score=False, col\_cluster=False, method='complete', +cmap='seismic', export=False, \*\*options) --------------- + Generate heatmap using selected genes/isoforms -**self.heatmap(self, z_score=True, col_cluster=False, method="complete", cmap="seismic", export=False, **options)** + Parameters -Generate a heatmap with self.selected genes/isoforms + ---------- -**z\_score** - True/False or 1. 
+ z\_score - True/False whether want or not apply z-score normalization -True is default. Whether or not apply z-score normalization. Z-score -normalization is quite common in heatmap for gene/isoform expression. Z -scores are: z = (x - mean)/std, so values in each row will get the mean -of the row subtracted, then divided by the standard deviation of the -row. This ensures that each row has mean of 0 and variance of 1. It is -possible apply the same normalization to the column giving z\_score = 1. + col\_cluster - True/False whether want or not cluster the samples -**col\_cluster** - True/False + method - clustering algorithm - default is complete-linkage -False is default. Whether or not cluster the samples. + cmap - map color -**method** - *str* + export - True/False whether want or not export the dataframe of -Linkage method to use for calculating clusters. Default is "complete", -because according to D'haeseleer P. Nat Biotechnol. 2005, complete -linkage is one of the best linkage methods for gene expression -clustering. See `seaborn.clustermap documentation to use other linkage -methods `__ + selected genes -**cmap** - *str* + \*\*options - all the options accepted by seaborn.clustermap -matplotlib colormap name or object, or list of colors. Default is -"seismic". Check -`here `__ -or `here `__ for more color -maps. See `seaborn.heatmap documentation for more -information `__ + default metric is euclidean. -**export** - True/False + Example -False is default. Whether or not export the heatmap as .png file. + ---------- -\*\*options Accept all the parameter and values accepted by -seaborn.clustermap. See `seaborn.clustermap documentation for more -information. `__ -i.e.: + Selection.heatmap(z\_score=True,export=True) -:: +lineplot(self, title='', legend=True, z\_score=False, export=False, +size=10, ci=None, \*\*option) - import papillon as pl + LinePlot selected genes expression levels. 
Max number of genes 200 - MyExp=pl.read_db("My Experiment") + Parameters - MyExp.get_isoform() + ---------- - MyExp.heatmap() + title - accept a str as title of the plot - MyExp.heatmap(z_score=False, export=True) + legend - True/False show the legend - MyExp.heatmap(z_score=True, col_cluster=True) + z\_score - True/False calculate the z-score normalization --------------- + export - True/False whether or not export the image -**self.lineplot(self, title="", legend=True, z_score=False, export=False, df=None, size=10, **option)** + size - change the size of the plot -Create a lineplot with self.selected. Max number of genes/isoforms is -200. + \*\*options - all the options accepted by seaborn.factorplot -**title** - *str* + Example -accept a string as title of the plot + ---------- -**legend** - True/False + Selection.lineplot(export=True,z\_score=True) -True is default. Whether show or not the legend. Default is True, if the -number of genes is higher than 50 is forced to False. +onlyFPKM(self, return\_as, remove\_FPKM\_name=False) -**z\_score** - True/False + Take a Papillon\_list object and return only FPKM columns. -False is default. Whether or not apply z-score normalization. Z-score -normalization. Z scores are: z = (x - mean)/std, so values in each row -will get the mean of the row subtracted, then divided by the standard -deviation of the row. This ensures that each row has mean of 0 and -variance of 1. + Parameters -**export** - True/False + ---------- -False is default. Whether or not export the plot as .png file. + return as: -**df** - *pandas.DataFrame* + "df" - pandas DataFrame -accept a dataframe different from self.selected. It should have a -papillon class structure. + "array" - numpy array -**size** - *int* Size of the figure to create. Default is 10 + "gene name" - pandas DataFrame containing gene names -\*\*options - all the options accepted by seaborn.factorplot. `See -seaborn.factorplot documentation for more -info. 
`__ + remove\_FPKM\_name: True/False -i.e.: + Example -:: + ---------- - import papillon as pl + df=Selection.onlyFPKM("df") - MyExp=pl.read_db("My Experiment") +plot(\*\*parameter) - MyExp.get_isoform() + Deprecated. Use self.lineplot() instead - MyExp.lineplot() +search(self, string) - MyExp.lineplot(title="My genes", legend=False) + Search a string in the Papillon\_list - MyExp.lineplot(z_score=True, export=True) + Parameters --------------- + ---------- -**self.plot** is deprecated + string - accept a str to search among the gene names --------------- + Example -**self.search(self, word, where, how="table", export=False)** + ---------- -search among gene/isoform names in detected and significant. + search\_results=Selection.search("IL") -**word** - *str* str to search among the gene names +select(self, genelist) -**where** - *"genes\_detected"*, *"genes\_significant"*, -*"isoforms\_detected"*, *"isoforms\_significant"* define where search -the word. + Create another Papillon\_list object -**how** - *"table"*, *"list"* or *"selected"* + Parameters -*"table"* return found names as pandas.DataFrame. + ---------- -*"list"* return found names as list with no duplicates. + genelist: accept string (1 gene name), list of gene names or file -*"selected"* select the genes/isoforms as self.selected, works only with -where="...significant" + with a list of gene names -**export** - True/False + Example -False is default. Whether or not export the found isoforms/genes as .xls -file. 
+ ---------- -i.e.: + Selection2=Selection1.select(["IL6","CCL15"]) -:: +show(self) - import papillon as pl + Show genes/isoforms as Dataframe - MyExp=pl.read_db("My Experiment") + Example - MyExp.search(word="IL", where="genes_significant", export=True) + ---------- - MyExp.search(word="CCL", where="isoforms_detected", how="list") + Selection.show() diff --git a/docs/papillon.html b/docs/papillon.html new file mode 100644 index 0000000..69f5420 --- /dev/null +++ b/docs/papillon.html @@ -0,0 +1,325 @@ + +Python: module papillon + + + + + +
 
+ 
papillon
index
/home/lab/Documents/GitHub/Papillon/papillon.py
+

A python version of CummeRbund to read and plot Galaxy/cuffdiff RNA-seq data

+Before To start
+========

+After RNA-seq analysis with Galaxy,
+download the 4 files generated by cuffdiff containing respectively:

+... transcript_FPKM_tracking...

+... gene_FPKM_tracking ...

+... gene_differential_expression...

+... transcript_differential_expression...

+And put them in the same folder.

+You can either use directly the files or change the names according to
+cummeRbund in:

+... transcript_FPKM_tracking... = isoforms.fpkm_tracking

+... gene_FPKM_tracking ... = genes.fpkm_tracking

+... gene_differential_expression... = gene_exp.diff

+... transcript_differential_expression... = isoform_exp.diff

+

+ + + + + +
 
+Modules
       
os
+pandas
+
re
+seaborn
+
warnings
+

+ + + + + +
 
+Classes
       
+
builtins.object +
+
+
Papillon_db +
Papillon_list +
+
+
+

+ + + + + + + +
 
+class Papillon_db(builtins.object)
   Create a Papillon_db object using read_folder() or read_files() and _papillon_builder
 
 Methods defined here:
+
__init__(self, path, samples, comparisons, genes_detected, isoforms_detected)
self.path - files path
+self.samples - samples found
+self.comparisons - comparisons found
+self.genes_detected - dataframe of genes detected
+self.isoforms_detected - dataframe of isoforms detected
+ +
__str__(self)
Return str(self).
+ +
change_order(self, new_order)
Change the samples order

+Parameters
+----------
+new_order: list of samples order

+Example
+----------
+MyProject.change_order(["Sample 4","Sample 3","Sample 2","Sample 1"])
+ +
drop_comparison(self, comparison)
Drop Comparison (str) or list of comparisons

+Parameters
+----------
+comparison: comparison (str) or list of comparisons

+Example
+----------
+MyProject.drop_comparison(specific_comparison)
+ +
get_gene(self, genelist=None, comparison='all', comparison_sign=None, fold_ind=None, fold_sign='>')
This method selects genes by name or conditions. It returns a Papillon_list object

+Parameters
+----------
+genelist - accept string (1 gene name), list of gene names or file with a list of gene names        
+comparison - To select genes higher/lower in one condition compared to another. Accept either "all" to pass all the comparisons, or accept only 1 comparison as str (already present in the data)
+comparison_sign - usable in combination with comparison, accept either ">" or "<"
+fold_ind - fold induction (log2) higher/lower than the given number
+fold_sign - usable in combination with fold_ind, accept either ">" or "<"

+Example
+----------
+Selection=MyProject.get_gene(["CD44","CCL15"], comparison="Sample 1_vs_Sample 2", comparison_sign="<", fold_ind=1, fold_sign="<")
+ +
get_isoform(self, genelist=None, comparison='all', comparison_sign=None, fold_ind=None, fold_sign='>')
This method selects isoforms. It returns a Papillon_list object

+Parameters
+----------
+genelist - accept string (1 gene name), list of gene names or file with a list of gene names        
+comparison - To select genes higher/lower in one condition compared to another. Accept either "all" to pass all the comparisons, or accept only 1 comparison as str (already present in the data)
+comparison_sign - usable in combination with comparison, accept either ">" or "<"
+fold_ind - fold induction (log2) higher/lower than the given number
+fold_sign - usable in combination with fold_ind, accept either ">" or "<"

+Example
+----------        
+Selection=MyProject.get_isoform(["IL6","CCL15","IL17RC"], comparison="Sample 2_vs_Sample 4", comparison_sign="<")
+ +
search(self, word, where, how='table', export=False)
search among genes/isoforms names in detected and significant

+Parameters
+----------
+word - accept a str to search among the gene names
+where - accept:
+    "genes_detected"
+    "genes_significant"
+    "isoforms_detected"
+    "isoforms_significant"

+how - accept:
+    "table" return the dataframe with the genes found
+    "list" return a list of names, no duplicates
+    
+export - True/False
+    
+Example
+----------
+search_result=MyProject.search(word="IL6",where="genes_significant", how="table")
+ +
+Data descriptors defined here:
+
__dict__
+
dictionary for instance variables (if defined)
+
+
__weakref__
+
list of weak references to the object (if defined)
+
+

+ + + + + + + +
 
+class Papillon_list(builtins.object)
   Class containing a selected list of genes, with data associated, generated by Papillon_db.get_gene() or Papillon_db.get_isoform() methods

+It is possible to add the content of two or more Papillon_list objects.
+Example:
+P_list3 = P_list1+P_list2
+P_list4=sum([PL1, PL2, PL3])
 
 Methods defined here:
+
__add__(self, other)
+ +
__init__(self, df, what, comparisons, path, samples, comparison='all', comparison_sign=None, fold_ind=None, fold_sign='>', p=0.05)
Initialize self.  See help(type(self)) for accurate signature.
+ +
__radd__(self, other)
+ +
__str__(self)
Return str(self).
+ +
compare(self, other)
Compare two Papillon_list objects

+Parameters
+----------
+other: another Papillon_list object

+Example
+----------
+Selection1.compare(Selection2)
+ +
export(self)
Export the selected genes/isoforms as excel file.

+Example
+----------
+Selection.export()
+ +
heatmap(self, z_score=False, col_cluster=False, method='complete', cmap='seismic', export=False, **options)
Generate heatmap using selected genes/isoforms

+Parameters
+----------
+z_score - True/False whether or not to apply z-score normalization
+col_cluster - True/False whether or not to cluster the samples
+method - clustering algorithm - default is complete-linkage
+cmap - map color
+export - True/False whether want or not export the dataframe of
+         selected genes
+**options - all the options accepted by seaborn.clustermap
+default metric is euclidean.

+Example
+----------
+Selection.heatmap(z_score=True,export=True)
+ +
lineplot(self, title='', legend=True, z_score=False, export=False, size=10, ci=None, **option)
LinePlot selected genes expression levels. Max number of genes 200

+Parameters
+----------
+title - accept a str as title of the plot
+legend - True/False show the legend
+z_score - True/False calculate the z-score normalization
+export - True/False whether or not export the image
+size - change the size of the plot
+**options - all the options accepted by seaborn.factorplot

+Example
+----------
+Selection.lineplot(export=True,z_score=True)
+ +
onlyFPKM(self, return_as, remove_FPKM_name=False)
Take a Papillon_list object and return only FPKM columns.

+Parameters
+----------
+return_as:
+    "df" - pandas DataFrame
+    "array" - numpy array
+    "gene name" - pandas DataFrame containing gene names


+remove_FPKM_name: True/False

+Example
+----------
+df=Selection.onlyFPKM("df")
+ +
plot(**parameter)
Deprecated. Use self.lineplot() instead
+ +
search(self, string)
Search a string in the Papillon_list

+Parameters
+----------
+string - accept a str to search among the gene names

+Example
+----------
+search_results=Selection.search("IL")
+ +
select(self, genelist)
Create another Papillon_list object

+Parameters
+----------
+genelist: accept string (1 gene name), list of gene names or file
+           with a list of gene names

+Example
+----------
+Selection2=Selection1.select(["IL6","CCL15"])
+ +
show(self)
Show genes/isoforms as Dataframe

+Example
+----------
+Selection.show()
+ +
+Data descriptors defined here:
+
__dict__
+
dictionary for instance variables (if defined)
+
+
__weakref__
+
list of weak references to the object (if defined)
+
+

+ + + + + +
 
+Functions
       
read_db(path, drop_comparison=None)
Deprecated. Use read_folder() instead.
+
read_files(files, path=None, drop_comparison=None)
Accept cuffdiff/cummeRbund files as iterable
+    ("transcript_FPKM_tracking",
+    "gene_FPKM_tracking",
+    "gene_differential_expression",
+    "transcript_differential_expression")
+and return them to _papillon_builder() to create a Papillon_db object.

+Parameters
+----------

+files - accept an iterable with the cuffdiff files names
+path - where to export Papillon generated files
+drop_comparison - drop comparison (str) or list of comparisons to drop from the cuffdiff table

+Example
+----------
+pp.read_files(["Files/gene_exp.diff","Files/genes.fpkm_tracking","Files/isoform_exp.diff","Files/isoforms.fpkm_tracking"])
+
read_folder(path, drop_comparison=None)
Read the folder containing the cuffdiff/cummeRbund files, and return
+them to _papillon_builder() to create a Papillon_db object.


+Parameters
+----------
+path - accept a str with the folder path, containing the cuffdiff files
+drop_comparison - drop comparison (str) or list of comparisons to drop from the cuffdiff table

+Example
+----------
+MyProject=pp.read_folder("MyFolder/Test_files")
+
+ \ No newline at end of file diff --git a/docs/pydoc papillon.html b/docs/pydoc papillon.html deleted file mode 100644 index 82183ae..0000000 --- a/docs/pydoc papillon.html +++ /dev/null @@ -1,233 +0,0 @@ - -Python: module papillon - - - - - -
 
- 
papillon
index
/home/lab/Documents/GitHub/Papillon/papillon.py
-

A python version of CummeRbund
-to read and plot Galaxy/cuffdiff RNA-seq data

-

- - - - - -
 
-Modules
       
os
-
pandas
-
seaborn
-
warnings
-

- - - - - -
 
-Classes
       
-
builtins.object -
-
-
Papillon_db -
-
-
Papillon -
-
-
-
-
-

- - - - - - - -
 
-class Papillon(Papillon_db)
   Select and plot genes/isoforms from a Papillon_db

-self.selected - gene/isoform selected
-self.type_selected - either gene or isoform according with selection type
 
 
Method resolution order:
-
Papillon
-
Papillon_db
-
builtins.object
-
-
-Methods defined here:
-
get_gene(self, genelist=None, comparison=None, sign=None, export=False)
This function select genes. It creates

-self.selected
-self.type_selected="gene".

-genelist - accept string (gene name), list of gene names or file
-           with a list of gene names
-comparison - accept only 1 comparison as str (already present in
-             the data)
-sign - usable in combination with comparison, accept either ">" or
-       "<"
-export - True/False whether want or not export the dataframe of
-         selected genes
- -
get_isoform(self, genelist=None, comparison=None, sign=None, export=False, show_dup=False)
This function select isoforms. It creates
-self.selected
-self.type_selected="isoform"

-genelist - accept string (gene name), list of gene names or file
-           with a list of gene names
-comparison - accept only 1 comparison as str (already present in
-             the data)
-sign - usable in combination with comparison, accept either ">" or
-       "<"
-export - True/False whether want or not export the dataframe of
-         selected genes
-show_dup - True/False whether want or not highlight duplicated
-           isoforms for the same gene
- -
heatmap(self, z_score=True, col_cluster=False, method='complete', cmap='seismic', export=False, **options)
Generate heatmap using selected genes/isoforms
-z_score - True/False whether want or not apply z-score normalization
-col_cluster - True/False whether want or not cluster the samples
-method - clustering algorithm - default is complete-linkage
-cmap - map color
-export - True/False whether want or not export the dataframe of
-         selected genes
-**options - all the options accepted by seaborn.clustermap
-default metric is euclidean.
- -
lineplot(self, title='', legend=True, z_score=False, export=False, df=None, size=10, ci=None, **option)
LinePlot selected genes expression levels. Max number of genes 200

-title - accept a str as title of the plot
-legend - True/False show the legend
-z_score - True/False calculate the z-score normalization
-export - True/False whether or not export the image
-df - accept an exernal dataframe, different from self.selected
-**options - all the options accepted by seaborn.factorplot
- -
onlyFPKM(self, return_as, **option)
It uses self.selected or an extra_df and Return only FPKM columns.

-return as:
-    "df" - pandas DataFrame
-    "array" - numpy array
-    "gene name" - pandas DataFrame containing gene names

-**option accept extra_df as exernal Pandas df
- -
plot(self, title='', legend=True, z_score=False, export=False, df=None, size=10, ci=None, **option)
Use self.lineplot() instead. self.plot() will not work in the future
- -
search(self, word, where, how='table', export=False)
search among genes/isoforms names in detected and significant

-word - accept a str to search among the gene names
-where - accept:
-    "genes_detected"
-    "genes_significant"
-    "isoforms_detected"
-    "isoforms_significant"

-how - accept:
-    "table" return the dataframe with the genes found
-    "list" return a list of names, no duplicates
-    "selected" put the genes found among the differential expressed
-               genes in self.selected (to plot),
-               working only with where="significant"
- -
-Methods inherited from Papillon_db:
-
__init__(self, path, samples, comparisons, genes_detected, genes_significant, isoforms_detected, isoform_significant)
Initialize self.  See help(type(self)) for accurate signature.
- -
__str__(self)
Return str(self).
- -
change_order(self, new_order)
Change the samples order

-new_order: list of samples order
- -
dropComparison(self, comparison)
Drop Comparison (str) or list of comparisons and re-calculate
-df_significant

-comparison: comparison (str) or list of comparisons
- -
selected_exist(self, remove=False)
Check if self.selected exists

-remove: True/False. If True remove self.selected and self.type_selected
- -
-Data descriptors inherited from Papillon_db:
-
__dict__
-
dictionary for instance variables (if defined)
-
-
__weakref__
-
list of weak references to the object (if defined)
-
-

- - - - - - - -
 
-class Papillon_db(builtins.object)
   Make a Papillon_db object and permit to change some values

-self.path - files path
-self.samples - samples found
-self.comparison - comparisons found
-self.genes_detect - dataframe of genes detected
-self.genes_significant - dataframe of genes significant
-self.isoforms_detect - dataframe of isoforms detected
-self.isoforms_significant - dataframe of isoforms significant
-expressed
-redefine __str__
 
 Methods defined here:
-
__init__(self, path, samples, comparisons, genes_detected, genes_significant, isoforms_detected, isoform_significant)
Initialize self.  See help(type(self)) for accurate signature.
- -
__str__(self)
Return str(self).
- -
change_order(self, new_order)
Change the samples order

-new_order: list of samples order
- -
dropComparison(self, comparison)
Drop Comparison (str) or list of comparisons and re-calculate
-df_significant

-comparison: comparison (str) or list of comparisons
- -
selected_exist(self, remove=False)
Check if self.selected exists

-remove: True/False. If True remove self.selected and self.type_selected
- -
-Data descriptors defined here:
-
__dict__
-
dictionary for instance variables (if defined)
-
-
__weakref__
-
list of weak references to the object (if defined)
-
-

- - - - - -
 
-Functions
       
read_db(path, drop_comparison=None)
Use read_folder() instead. read_db() will not work in the future
-
read_files(files, path=None, drop_comparison=None)
Accept cuffdiff/cummeRbund files as iterable, and return
-them to _papillon_builder().

-files - accept an iterable with the cuffdiff files
-path - where export Papillon generated files
-drop_comparison - drop comparison (str) or list of comparisons and
-re-calculate significant genes/isoforms
-
read_folder(path, drop_comparison=None)
Read the folder containing the cuffdiff/cummeRbund files, and return
-them to _papillon_builder().

-path - accept a str with the folder path, containing the cuffdiff files
-drop_comparison - drop comparison (str) or list of comparisons and
-re-calculate significant genes/isoforms
-
- \ No newline at end of file diff --git a/papillon.py b/papillon.py index 646c9cf..777da80 100644 --- a/papillon.py +++ b/papillon.py @@ -1,33 +1,73 @@ # -*- coding: utf-8 -*- -"""A python version of CummeRbund to read and plot Galaxy/cuffdiff RNA-seq data""" +"""A python version of CummeRbund to read and plot Galaxy/cuffdiff RNA-seq data + +Before To start +======== + +After RNA-seq analysis with Galaxy, +download the 4 files generated by cuffdiff containing respectively: + +... transcript_FPKM_tracking... + +... gene_FPKM_tracking ... + +... gene_differential_expression... + +... transcript_differential_expression... + +And put them in the same folder. + +You can either use directly the files or change the names according to +cummeRbund in: + +... transcript\_FPKM\_tracking... = isoforms.fpkm_tracking + +... gene\_FPKM\_tracking ... = gene.fpkm_tracking + +... gene\_differential\_expression... = gene_exp.diff + +... transcript\_differential\_expression... = isoform_exp.diff + +""" import os import warnings from distutils.version import LooseVersion import pandas as pd import seaborn as sns +import re warnings.simplefilter('default', DeprecationWarning) -if LooseVersion(pd.__version__) < LooseVersion("0.17.1"): - raise Exception("Pandas >= 0.17.1 required") +if LooseVersion(pd.__version__) < LooseVersion("0.23"): + raise Exception("Pandas >= 0.23 required") if LooseVersion(sns.__version__) < LooseVersion("0.8.1"): raise Exception("Seaborn >= 0.8.1 required") def read_db(path, drop_comparison=None): - """Deprecated. Use read_folder()""" + """Deprecated. Use read_folder() instead.""" warnings.warn( "read_db() deprecated. Use read_folder() instead.", DeprecationWarning) def read_folder(path, drop_comparison=None): """Read the folder containing the cuffdiff/cummeRbund files, and return - them to _papillon_builder(). - + them to _papillon_builder() to create a Papillon_db object. 
+ + + Parameters + ---------- path - accept a str with the folder path, containing the cuffdiff files - drop_comparison - drop comparison (str) or list of comparisons""" + drop_comparison - drop comparison (str) or list of comparisons to drop from the cuffdiff table + + Example + ---------- + MyProject=pp.read_folder("MyFolder/Test_files") + + """ + if drop_comparison is None: drop_comparison = [] try: @@ -69,12 +109,26 @@ def read_folder(path, drop_comparison=None): def read_files(files, path=None, drop_comparison=None): - """Accept cuffdiff/cummeRbund files as iterable, and return - them to _papillon_builder(). + """Accept cuffdiff/cummeRbund files as iterable + ("transcript_FPKM_tracking", + "gene_FPKM_tracking", + "gene_differential_expression", + "transcript_differential_expression") + and return them to _papillon_builder() to create a Papillon_db object. + + Parameters + ---------- + + files - accept an iterable with the cuffdiff files names + path - where to export Papillon generated files + drop_comparison - drop comparison (str) or list of comparisons to drop from the cuffdiff table + + Example + ---------- + pp.read_files(["Files/gene_exp.diff","Files/genes.fpkm_tracking","Files/isoform_exp.diff","Files/isoforms.fpkm_tracking"]) + """ + - files - accept an iterable with the cuffdiff files - path - where export Papillon generated files - drop_comparison - drop comparison (str) or list of comparisons""" if drop_comparison is None: drop_comparison = [] @@ -269,17 +323,16 @@ def _obtain_list(genelist, path): # TO DO - eventually remove empty one # self.gene_diff class Papillon_db: - """Make a Papillon_db object and permit to change some values - - self.path - files path - self.samples - samples found - self.comparisons - comparisons found - self.genes_detected - dataframe of genes detected - self.isoforms_detected - dataframe of isoforms detected - redefine __str__""" + """Create a Papillon_db object using read_folder() or read_files() and 
_papillon_builder""" def __init__(self, path, samples, comparisons, genes_detected, isoforms_detected):#, genes_significant, isoforms_detected, isoform_significant): + """self.path - files path + self.samples - samples found + self.comparisons - comparisons found + self.genes_detected - dataframe of genes detected + self.isoforms_detected - dataframe of isoforms detected + """ self.path = path self.samples = samples self.comparisons = comparisons @@ -301,28 +354,45 @@ def __str__(self): return visual def drop_comparison(self, comparison): - """Drop Comparison (str) or list of comparisons and re-calculate - df_significant + """Drop Comparison (str) or list of comparisons + + Parameters + ---------- + comparison: comparison (str) or list of comparisons - comparison: comparison (str) or list of comparisons""" + Example + ---------- + MyProject.drop_comparison(specific_comparison) + """ self = self.Manipulate.dropComparison(self, comparison) def change_order(self, new_order): """Change the samples order - new_order: list of samples order""" + Parameters + ---------- + new_order: list of samples order + + Example + ---------- + MyProject.change_order(["Sample 4","Sample 3","Sample 2","Sample 1"]) + """ self = self.Manipulate.change_order(self, new_order) def get_gene(self, genelist=None, comparison="all", comparison_sign=None, fold_ind=None, fold_sign=">"): - """This function select genes. It return a Papillon_list object - - genelist - accept string (1 gene name), list of gene names or file - with a list of gene names - comparison - accept only 1 comparison as str (already present in - the data) - sign - usable in combination with comparison, accept either ">" or - "<" - fold_ind - fold induction (log2) higher then number + """This method select genes per name or conditions. 
It return a Papillon_list object + + Parameters + ---------- + genelist - accept string (1 gene name), list of gene names or file with a list of gene names + comparison - To select genes higher/lower in one condition compared to another. Accept either "all" to pass all the comparisons, or accept only 1 comparison as str (already present in the data) + comparison_sign - usable in combination with comparison, accept either ">" or "<" + fold_ind - fold induction (log2) higher/lower then number + fold_sign - usable in combination with fold_ind, accept either ">" or "<" + + Example + ---------- + Selection=MyProject.get_gene(["CD44","CCL15"], comparison="Sample 1_vs_Sample 2", comparison_sign="<", fold_ind=1, fold_sign="<") """ try: fold_ind=float(fold_ind) @@ -332,13 +402,18 @@ def get_gene(self, genelist=None, comparison="all", comparison_sign=None, fold_i def get_isoform(self, genelist=None, comparison="all", comparison_sign=None, fold_ind=None, fold_sign=">"): """This function select isoforms. It creates a Papillon object - genelist - accept string (gene name), list of gene names or file - with a list of gene names - comparison - accept only 1 comparison as str (already present in - the data) - sign - usable in combination with comparison, accept either ">" or - "<" - fold_ind - fold induction (log2) higher then number""" + Parameters + ---------- + genelist - accept string (1 gene name), list of gene names or file with a list of gene names + comparison - To select genes higher/lower in one condition compared to another. 
Accept either "all" to pass all the comparisons, or accept only 1 comparison as str (already present in the data) + comparison_sign - usable in combination with comparison, accept either ">" or "<" + fold_ind - fold induction (log2) higher/lower then number + fold_sign - usable in combination with fold_ind, accept either ">" or "<" + + Example + ---------- + Selection=MyProject.get_isoform(["IL6","CCL15","IL17RC"], comparison="Sample 2_vs_Sample 4", comparison_sign="<") + """ try: fold_ind=float(fold_ind) @@ -348,6 +423,8 @@ def get_isoform(self, genelist=None, comparison="all", comparison_sign=None, fol def search(self, word, where, how="table", export=False): """search among genes/isoforms names in detected and significant + Parameters + ---------- word - accept a str to search among the gene names where - accept: "genes_detected" @@ -357,7 +434,14 @@ def search(self, word, where, how="table", export=False): how - accept: "table" return the dataframe with the genes found - "list" return a list of names, no duplicates""" + "list" return a list of names, no duplicates + + export - True/False + + Example + ---------- + search_result=MyProject.search(word="IL6",where="genes_significant", how="table") + """ # "selected" put the genes found among the differential expressed # genes in self.selected (to plot), # working only with where="significant" """ @@ -400,8 +484,7 @@ def _compare(self, pp): return genes_not_found, isoforms_not_found, n # Only for tests so far. 
def dropComparison(self, pp, comparison): - """Drop Comparison (str) or list of comparisons and re-calculate - df_significant + """Drop Comparison (str) or list of comparisons comparison: comparison (str) or list of comparisons """ @@ -500,14 +583,17 @@ def df_or_list(df_, how_): names = list(set(df_["gene_short_name"])) print(names) return names +# elif how == "selected": +# pass + else: raise # Checking input if where not in ["genes_detected", "genes_significant", "isoforms_detected", "isoforms_significant"]: raise Exception("where= not known") elif how not in ["table", "list", "selected"]: raise Exception("how= not known") - else: - pass +# else: +# pass word1, word2 = where.split("_") if word1 == "genes": df = pp.genes_detected[pp.genes_detected["gene_short_name"].str.contains( @@ -532,10 +618,11 @@ def df_or_list(df_, how_): print(len(df_sig), " differential expressed.") if how == "selected": - if word2 != "significant": - raise Exception( - 'how == "selected", but only significant genes/isoforms can be selected.') - elif word2 == "significant": + if word1 == "genes": +# pass +# raise Exception( +# 'how == "selected", but only significant genes/isoforms can be selected.') +# elif word2 == "significant": found = df_sig.copy() self.selected = found.copy() elif word2 == "detected": @@ -601,11 +688,16 @@ def get_isoform(self, pp, genelist=None, comparison="all", comparison_sign=None, class Papillon_list: - """Class containing a selected list of genes, with data associated + """Class containing a selected list of genes, with data associated, generated by Papillon_db.get_gene() or Papillon_db.get_isoform() methods - df should have: - [isoform_id(optional?)] [gene_id] [gene_name] [At least 2 readcounts - n] [at least 1 vs condition (n-1+n-2+n-3...ecc)] [at least 1 q_value(n-1+n-2+n-3...ecc)] #check if p_value>q_value - """ + Is possible add the content of two or more Papillon_list objects. 
+ Example: + P_list3 = P_list1+P_list2 + P_list4=sum([PL1, PL2, PL3]) + """ +# df should have: +# [isoform_id(optional?)] [gene_id] [gene_name] [At least 2 readcounts - n] [at least 1 vs condition (n-1+n-2+n-3...ecc)] [at least 1 q_value(n-1+n-2+n-3...ecc)] #check if p_value>q_value + def __init__(self, df, what, comparisons, path, samples, comparison="all" , comparison_sign=None, fold_ind=None, fold_sign=">",p=0.05): self.Manipulate = _Manipulate_list() self.df=df @@ -656,36 +748,94 @@ def __radd__(self, other): return self.__add__(other) # sum([T1, T2, T3]) def show(self): #to_improve - list gene, df - """Show genes/isoforms as Dataframe """ + """Show genes/isoforms as Dataframe + + Example + ---------- + Selection.show() + """ self.Manipulate.show(self) -# def select(self): TO DO -# pass - #create another Papillon_list object - + def select(self, genelist): + """Create another Papillon_list object + + Parameters + ---------- + genelist: accept string (1 gene name), list of gene names or file + with a list of gene names + + Example + ---------- + Selection2=Selection1.select(["IL6","CCL15"]) + """ + new_pp=self.Manipulate.select(self, genelist) + return new_pp + def export(self): - """ Export the selected genes/isoforms as excel file.""" + """ Export the selected genes/isoforms as excel file. 
+ + Example + ---------- + Selection.export() + """ self.Manipulate.export(self) def compare(self, other): - """ Compare two papillon objects""" + """ Compare two Papillon_list objects + + Parameters + ---------- + other: another Papillon_list object + + Example + ---------- + Selection1.compare(Selection2) + + """ self.Manipulate.compare(self, other) + def search(self, string): + """Search a string in the Papillon_list + + Parameters + ---------- + string - accept a str to search among the gene names + + Example + ---------- + search_results=Selection.search("IL") + """ + new_pp=self.Manipulate.search(self, string) + return new_pp + def onlyFPKM(self, return_as, remove_FPKM_name=False): - """Take a Papillon dataframe and a list of samples, return only FPKM columns. + """Take a Papillon_list object and return only FPKM columns. + Parameters + ---------- return as: "df" - pandas DataFrame "array" - numpy array - "gene name" - pandas DataFrame containing gene names""" + "gene name" - pandas DataFrame containing gene names + + + remove_FPKM_name: True/False + + Example + ---------- + df=Selection.onlyFPKM("df") + """ return self.plot.onlyFPKM(self.df, self.samples, return_as, remove_FPKM_name) def plot(**parameter): - """Deprecated. Use self.lineplot()""" + """Deprecated. Use self.lineplot() instead""" warnings.warn('self.plot() is deprecated. 
Use self.lineplot() instead.', DeprecationWarning) def heatmap(self, z_score=False, col_cluster=False, method="complete", cmap="seismic", export=False, **options): """Generate heatmap using selected genes/isoforms + + Parameters + ---------- z_score - True/False whether want or not apply z-score normalization col_cluster - True/False whether want or not cluster the samples method - clustering algorithm - default is complete-linkage @@ -694,18 +844,29 @@ def heatmap(self, z_score=False, col_cluster=False, method="complete", cmap="sei selected genes **options - all the options accepted by seaborn.clustermap default metric is euclidean. + + Example + ---------- + Selection.heatmap(z_score=True,export=True) """ self.plot.heatmap(self, z_score, col_cluster, method, cmap, export, **options) def lineplot(self, title="", legend=True, z_score=False, export=False, size=10, ci=None, **option): """LinePlot selected genes expression levels. Max number of genes 200 + Parameters + ---------- title - accept a str as title of the plot legend - True/False show the legend z_score - True/False calculate the z-score normalization export - True/False whether or not export the image - df - accept an exernal dataframe, different from self.selected - **options - all the options accepted by seaborn.factorplot""" + size - change the size of the plot + **options - all the options accepted by seaborn.factorplot + + Example + ---------- + Selection.lineplot(export=True,z_score=True) + """ self.plot.lineplot(self, title, legend, z_score, export, size, ci, **option) @@ -795,8 +956,8 @@ def compare(self, pp, other, return_="common"): #to do, show list gene, return l if ele not in common: onlyB.append(ele) print("Number of ",pp.what,"in common: ",len(common)) - print("Number of ",pp.what,"in the first Papillion_list: ",len(A),". Not in common with the second:",len(onlyA)) - print("Number of ",other.what,"in the second Papillion_list: ",len(B),". 
Not in common with the second:",len(onlyB)) + print("Number of ",pp.what,"in the first Papillon_list: ",len(A),". Not in common with the second:",len(onlyA)) + print("Number of ",other.what,"in the second Papillon_list: ",len(B),". Not in common with the second:",len(onlyB)) # if return_==common: # pass @@ -806,9 +967,39 @@ def export(self, pp, name="Table"): pp.df.to_excel(str(pp.path + name + '.xls'), sheet_name='Sheet1') print("\nExported as " + name + ".xls\n") - def search(self): - pass - #TO DO + def search(self, pp, string): + a=pp.df[pp.df.index.str.contains(string,flags=re.IGNORECASE)] + b=pp.df[pp.df.gene_short_name.str.contains(string,flags=re.IGNORECASE)] + c=pp.df[pp.df.gene_id.str.contains(string,flags=re.IGNORECASE)] + a.reset_index(inplace=True) + b.reset_index(inplace=True) + c.reset_index(inplace=True) + df=a.merge(b, how="outer") + df=df.merge(c, how="outer") + df.set_index("tracking_id",inplace=True) + print("\nFound:\n",df) +# print() + return Papillon_list(df, what=pp.what, comparisons=pp.comparison, comparison=None, path=pp.path, samples=pp.samples) + + def select(self, pp, genelist): #TO DO: BE TESTED!!!! + """Create another Papillon_list object """ + if isinstance(genelist, Papillon_list): + genelist = set(genelist.df["gene_short_name"].tolist()) + else: + genelist = _obtain_list(genelist, path=pp.path) +# df = self._select(pp, genelist, "gene") # To do - Return number genes not found? + + df = pd.DataFrame.copy(pp.df) + n=0 + if genelist != []: + df["Selected"] = [True if name in genelist else False for name in df["gene_short_name"]] + df = df[df["Selected"] == True].iloc[:, :-1] + for name in genelist: + if name not in df["gene_short_name"].tolist(): + print("Gene name not found:\t", name) # TO DO: return not found list ? 
+ n+=1 + print("Number of gene not found: ",n) + return Papillon_list(df, what=pp.what, comparisons=pp.comparison, comparison=None, path=pp.path, samples=pp.samples) class _Plot: @@ -921,7 +1112,7 @@ def lineplot(self, pp, title, legend, z_score, export, size, ci, **option): legend - True/False show the legend z_score - True/False calculate the z-score normalization export - True/False whether or not export the image - df - accept an exernal dataframe, different from self.selected + **options - all the options accepted by seaborn.factorplot""" def subplot(df, pp, title="", legend=True, z_score=False, export=False, size=10, ci=None, **option): @@ -975,4 +1166,4 @@ def export(self, thing, path, export, name=None, image_extension=".png"): # add thing.savefig(str(path + name + image_extension)) print("\nExported as " + name + image_extension) else: - raise Exception("export= can be only 'False' or 'True'") \ No newline at end of file + raise Exception("export= can be only 'False' or 'True'") diff --git a/test/Test_files/Papillon/Table gene All them.xls b/test/Test_files/Papillon/Table gene All them.xls new file mode 100644 index 0000000000000000000000000000000000000000..185e548ee49d328aee439516cfba24d08ed2dd4d GIT binary patch literal 10240 zcmeHN3viUx6+ZvIvIz;2)Hb5MJr=A!1I_C%%4W3|wPKSz&!LsfGI>SQ54 z%~B$F$^zUM%eP?7D=&U{jCC@vgwvWYu7v*MXl{(3n1PR*EB0IGKiiE&?sCd6^Ew#v zD)x_xO}038<0^6t^YA-V5 zC*Iq}hs!YTSV^Z3M$BmmAI~hYc^;+r`0TWJC+u#}lRqpdSmRiO0M+`? 
zAC#USChOyh;V+taM_xh9e-wQTZ_4RHpDI5_4>{{?7(Ogf-aTQvx)+39>RuRjsJl0u zrS84L&gj#~D>_=woH;FO^Jre8=Ma&^U`rXMq9ICNlmAo)8MDq$HXvo(A{%ABJ5x~(1cp*315s|V`RLVL!fCn{e z1Qq&(LJ?_cX%P^JZ2AW7_@YKs4WSKQAik+`)@o#|x4D%Z>utRgpN&|Z|cRO;PxT~c2Eb=*b zX>RP++(oGW74ck@9k!9c4?x9Eu6c38ta*(K=F~UN#+Y-EOX`=*UAVZpenG?hAhL_X zG|q;U=?(K2H3dsX)z?h>YHb28O@Yf&;Bo~cH^uoMTT*YfOK%c+bi`u~O-q9@)TsL8 z?oeZW)51BWX}i-4E$dz=ySouumR@Li_d?m-jnMLh&{E6urO`CCJ4I+xcY2{|G_`W2 z(KM+$A+*e*X&Rxa-6@(Tb>l=adCKj$HZBOkkta_XH5#|-N#*76z}o3E>h2mj>bCOH zb^JL5^(&g3G9 zW+S+`0Vq%blHq%4m(=XElg%e4=d5L+c8ve3{%%tinbRPU{DOI z9yHAtZ`M4a`kqvDgCGqF0Oe5St(Nv+v`74Y6~2G19D=s%p)Jmbw!ByQ5*{xbIvVQp zgbX)^8_!2reo+~_8*(3HXUlRm3CLY}d?HaW`ED*qz&7+)`|U_QPEVpg3u^z#mw=f@ z0{s$jHpXeaDFNre09N`QNFrAHTuM=OX?slcs3Rd8GtyThHQX1qK52%IF^PS zfH;bVoPa1_LoPrFLlepZgjh5oHy}ovrpyM!R{|Q!0mP`&P!B+SO`xG%Knzz6C2Yz` zLI?XC_~U?9lZH-G_wuQJ`>Qs?N%}%gR=Niun#QJxEG$B?2e2u>VFYU>vcbF~bnd|I z6_-tM8GVPoVq-dwKfevW5|AIQSoIy>(yS`2O-6sw>@ok)qvwPD=lzEac1Pq5Qd3Bs;ypMYq4LLAzzY=2MHS-1C8MA_A@Y;P-=t&~`Zv2; z?r+`xPjmEcZEfc|Z(*mce|l_f$H2-QbM$U)ZRfIYVW+Ks^=eV|*n4g5jB_rJKBO0I zYlpIRaU7zp*+*L=-2-}W%Fg#csIgaR4CDO5i*vS~T;eb>wDoZVbuII{Dj^k?#+7Zd6VCVbYC&AbLY^aW2Oojjju(} z`0Mwy?Tu_YYKl<%z8|hXT(R?aD%iOr@=1Tw&9v^oLo|((Mo2U>n!_Be*LOA@#E}sz*A2(Ybj$GQfxqxr^IkkgU?tIFOLs5%4atN zbdlKaxbz1G3t@&O?>EX9djqB8#}aTn(Hcpetl&1hI1FD&h@;q&Mm`k z-TFQkj&Gwsta!?GZ#>n$w3ms4I)(q8! 
zAPs`g&sSwEPMI!C@%;1gt2h7qF&X~?2`QTp(~Muk7Xp*H3^vE-x&l$(AyxO literal 0 HcmV?d00001 diff --git a/test/Test_files/Papillon/Table isoform all them.xls b/test/Test_files/Papillon/Table isoform all them.xls new file mode 100644 index 0000000000000000000000000000000000000000..d05fe1498cfe1b5691a65b931ed57930e9fc6cc5 GIT binary patch literal 20992 zcmeI43wTu3wa52lCdo{aN#>DxV-DUVlR^@P$@aKLHF-D5L=9#T0@RLKI3Ulu{_85T;O0p@IT+0TrbXqY$T1Ng+WY zNg+icO`(cHHH8`qwG;+WID*1J3Uw3)Q5a0&YZTCr^e|kD{|z z7Src4buP91;lGm9W2`rRIGR>}aX9J!RGNR3Z??dnYAal~wtpx{R$Nr5{xjSMlV{t1 zRTZeMfyd}wsXtKoj#>)eFr}??UfuuI>o~4Mn-6v9p+RO6eei-K>o9a~g zSIIMet8P*soj%u^~!^4xKIH&Y( za^&ii{_j`5iQ2Rdx2L`Ib{-||a3Ehk!bN_Ri+rSuJme@Zq@LhIU;QK+-9B(T4Vh9O z#qv6m+v{V<4{7p1II8y_zAs%)`K&Dk;?G!DfIs71#r#>>RhYY$ zJb~C3`5ML3J?$;`KG_f?B=17O+FKWWb|FGvdU|^3qf2#n=TvvLgOc{CK1$y~|8gW( z9Te|FhEyf|!&lCabKpEs3MsK(AL-|E-Kv}7xo*;>d~D%+7$*}guKQ)+~63LTVv zN4HwHZk?h8D%(whEd8g@&4FauZk1`CpuWdm<`VKT6EW!_Z#d`s?d>ZvSE+zHivB(3 z3sO+(wOuwd&01>ekZJzqyqelBQ$IUt#@zb)`f>I3W3sd&IPsg~=zZbh z4&zGG_@*(9)X7i$W~NDmndXK}bJLgxS}ZJE+}6<0oEb;gM_ZM~WhgrmPMO-+NNWc! 
zBU4|W$&Q~eCSwy!$m9zeEP{smhNh;be8D&toQ8^zYtD?%PN=t~$Tp12j>*uHhN|2= zdkR&0#+;L9e|=1*zOl)yO+!7UYs^=qva~zTHZ*4&#^|@)lx-YGw?tLfi_K;mn{D+V zf!v&#qDk_%2;^QY0{gvWvrYM#HjU3Kr76D@s#r!?WSTOKT8-)MCX6F;w7Q!zh0u{- z00m7tx`8|h%SSilXtg(4ZK8x&?ad1&D9{(}h1DHi1@MkCb&S}ij%DQj=962*WGl+9 zFhUOR(CzO%JLAk*Epu7k#S2T$M-9a=M2>wmU3n}pk~K#0`-PmglBPnGV%U^nvl7PF z<1gZC7jwE{lv|cx&$SGM=h`qskJx4zJ_Q4OX=?R=^q{#~Sw z@*U;7z8kNHTl7z#I)(BMsq?uJlzP1acceHDtzl6!UO-oNhRwL2291)4884=4?@pL; z3;?wQ%y@{(xu@2Qmk0*P`=Ctbk(ZJreU% z4E;u80g9o=NUV@z=qnN{q8NIK#EL0~{voj-#n3Y(7NQvXgv3fHhTb5tQi`D;NUV%v z=m8Q7Qw;1SR!%WcmskbGz*}Mwih;Dmq7(yTiNz=ex)O_13|u8vNih(W7_1pAC6=V` zpeV5v#lTNuX^MdyVra<(wYeVikrrslwE``+jq2c|mQ8Y7C%FXz%1<^=OK*~nX->bj z_3;~9L{dbO=Y~ysd|>06mLe+2M{S#;ca+7;1Ntmb1ALNekW!%S)64_g)-{x}g!M>F z8ZwF%Dj&p;(kk0W`;jFjs+KZQKc)fAM-W6^s$E7gbT7#&?3`8D$SNGRW>s#>s=}64 z#5t>okyRvO%_?fkDrUqp)l$Mpi zf>VF&?qM_kX~vt|(RwhZ1n>7s3>^YHzY;@-Kn%JC+#ybR_S)S`*IvNr3&`A(YSqN+ zHuPS3ktRX+c;@8S)G5uIH3_=MMZw{xJ==OSb&nd#r`YKRAO>t?T3k#eYOo8)HU+j+ zB1EyZz71E@JoucJoonAgUnwy9im0Qn-1fQJH10irtJKD|zlgftKSUk<bhT$8iMt%y+0WBacTs^@{@yxd&0}+X2R@N{N%CCp5`ZyomP7?x+iNt`F5*1Scmq``PImw?`oxcvi6h9wW@>lMLjz8&0bLA? 
z34WF%F)QnUjWs!XqPFiP)%R|66_X7x>t_b12(>xtEAO>b;X*Y%U{_p75C&DF!)Nz;EN#vF-&t3 zYLF2F+rdT*c8u6Ivh+e`3vNlCp6CNSBt_hQg@ni)yp5q4VtD65N<`kdfV?x0=Y?#s zGM=AKxyX24=*|z5fI@>`ntC4AuV{cmcXFbnTc;>hXiBcUL`kTjJ*L!JNECUKC^BqFl!QbP&1oG=ifAP{ktm|+t)o(0Q*t7K zLB&xisVO;;Ko?E$IS90Uv5f@vXUY;>;LT|6L9rX#roS<=@Im%}*w{#_K`-9DE3`?M zpgX*B(|}J;x@I_!b;SfAseC^^H0dHsO6v2RV%ykP%s#Q> z{inw@Z@z_HYOy;>xYT09rsN*r6Puo$eeT4;wXNvB#qLZ+UAHb#$Jn=H^LJi7M*in>l$)ZrIHR}7r@&Sy1R>8_kbUFR(7@Qp$LueV(`{S&QpSK^|s6Bl*(#`arJ z9P!7ZK|J=sizU=hdI$TUjeXF@KFI9P&aHWK=bPQkHOTD$xPQ*nV|s7ZBw+vSizcePqk*R2-!8zldk zR@IFk&Rm-LjGHUy%{pkkdo`1|Yh90wVKhJ5gbWiZvC$J9LSceSO@O>f6 zScp+b7PujH^pWs2i?_A4PeyCQk?MZ9hGMQ{B~zWO_(t)_{7^~E7?f+2&Tlm~)ld9L)NJl#^IJQ*89)Ibj19rF-rLES#3JTR-%tPz_l2{qh^ zS?4ZTy_tR)%Y&qvwROVI>@WJ*FJm5>RNq?E5NbQ~8BGG)jKA_xVCHcv*)}oiCe{4) z12@kfb1(a4_)3S$!Y`w;uuju9fj_M>PoA}*{lfd$Hj% z9k#h+<{jrBeUmtr5^74vdl$|nHrqrqrPaRW_a{D|ekA*0OimK&NR4%=%{EI7*63!X zj<#9Kwt48KmA@Z3x{KE#r8e6vbtfyC>SV39S?bLboBGln(05Cjr*12y?kuG|ou$<_ zOT7t}8U&?0t8JEg&z4FJ&uX>JQg3FZ1~VzoYMa>HOsEkWFYLo5)kq@-+Z;tPbd@sZ z_1i^Po_E`ix3Jxnv2FJLaKy?d$3L%0V4EjQfBo&?@K-enZ1ctIUWi`S`3f5srbP}N zOt;N4n{Ae{ZBD)Plt?O8%$+GL zq|7#Uma%mHN!&zt#nV`VHk7!yK{mlT=HO8&ps} zNG!UXJ@hM+S`WV7_lTATtfwt~@PeN2Ze>_lVL7xmY#dd@gx20c%bE8Bl|TKsrRSOL z7(_8LI%I({ShC2sb9gA4Bd5wQp8uFka?uYQv!E``wSe!ox{7k?I3~$M!71f9xM7X7*hY)Uxp8UvBt!$q@Vxo-=5` z5s$yYwvNYA2{lIJUt!~qN8Kj>3g-XbyuLH0_rA|;E13Vl=>s3xF!=*b0$V?+`{zrh z9(-JrfdATb;MDTbJ4g>4ypA1N=42H%{G}%pVUT6Kbr+ACK!z z`;XZ8N0|RTb#)uR`@(t5Hp2WPch5Oz-6v5^0{#cecAs?9<%@WR7E$1j^%eSi#GO5y z6;7a-bMFXqFPSyxsplW5<`$2*+r6mkb2T(xgqgqkv(L_*6Mc`}M#P=DsO$4JIK~Lu zd!12nSJt9x-T#Qt3^uKzcl>7m>Q%{Bxo=lH7j=E!EhU4oS+XbJh{0D>!56BC;jwZ; z)f+J@TO9c@Z9B?r!yiA@cXs@amTjm#vjsQnex&R~M!hLU4T@3;tf1;BBQ!+Rn_$!! 
zsYMlD+f;S+*l5EjTde+!KWd{FC7O;lB72iKSR>s|CeE@;9Cf!%%ps5?-+OIC7J5pQ96(8EycY_#cibGOiEwB z#@$ItdAdK8@?@Vs?#>8i7&jQfg5&H7&GKBmpp>V3L90c@-APG#`utYPv-&?AmPz0M zIeSC=6^3b7l?E^5QfctQ$Wh5Q^^LW&cfLJz8(A5;5XLo=r}#k}d2}##eYIL?v#(0F zFTE)$-Kj}=x_wDg$a9%WZ(fy#Wm$Pudh@EZ@v3BAb<|>T)|Kv-C5@)@Let}PlIcGx z-Koj_>C~jE;s8AcHPl#>Q!^y8$>PI30l@$TgLY0~hnRy$3)Gezgeqj&2tCVn;P zZKp|_ohEH|N;}@pm5lTLMbaQB_h+?JoB?*cKbfy29mhc&Elj9!8ZVp{FnOhHyizt^ zDdsiy#vk4H^6eYhB2&z3;%T|P^>cUdh?ugCh$(M!DI2+zjal)(=@H^uyFhi-W$vq5%&Qr-+xHikIgY5LlfJ43ls-Pg*UT1QA63r(nI zjU+mPNfJHUw7j&9WSU8i__rx*UmiP$Nv4@(@>}uqC-3+rlSCI6TZEUvp-+d7rjM2M zyq7NPvv+!UOfh||Bxm8Isy}fde`-^Ohc#7Qy(=t{9*$_L))6!9O%iJ((;=ogn+4m_ zJ1$=(PukPoB-1vMayGY)07xUbS|doN4Qm8RaFkAf7>@KN`0xHCW+ho=BUxo5S;ZvJ zs9kaO_6vW^+^P&8ik@1<6hD4@$lya)?&2Y*%9~`BjbxRLWR*Kf;HWZ4TD@tNyEURw zstks5$r`ru$pO}u;6=p+Gq4K&1xIXYNq*zUjLeD z*UN5fwKvUbgQgS+yL^7f#dF`($E7V!YuK?%N8 zlyqNewZIy8QljMKMX}lESd#3yU@OR4J=t@iWAM&Oavlsz604G&KB%FFayYUGF%DYQ z&_OFY5pNt>tYJQKWD&owAZri&DM~^eqp_&9k*Q^?o1~Gc?MFt|g?N%8>q0u%<#ZQ1 znOa9O^d}BZN;;WZ!;-9IYQ4$S8a8Ad-)g~^duQE*=Fum&o!Q>m*-marspA%Qbc~nef7v_rKPmJtQ>WU)+WKdKe!csW z_3wW=^ZSuI*96t5;rIR)rhXm;R29DL>P_{6J~-{lS8A8!6l;j}lNLy`SnZuLhQ1-qW!b;FpN$l9M# z{g3+e?x`dz%#(eGWLR$@a|F0A=%TJ}a=L81zRKT!;_~zQkWrB33(m}PCF$ea^y^d0 z)FO2z{|?n6ORB^BiYFPkU-&Nc;lt5F`tzqqN}rlSC7(r%XHp%0a5(C=G$$6r(a{{nLAbeRAE literal 0 HcmV?d00001 diff --git a/test/test_papillon.py b/test/test_papillon.py index 88edd88..ab893c5 100644 --- a/test/test_papillon.py +++ b/test/test_papillon.py @@ -68,7 +68,7 @@ def test_papillon_db(self): self.assertTrue(a==c and c==24) print_test=pp.read_folder(path) printable="Samples: ['Sample 1', 'Sample 2', 'Sample 3', 'Sample 4']\nComparison: ['Sample 1_vs_Sample 2', 'Sample 1_vs_Sample 3', 'Sample 1_vs_Sample 4', 'Sample 2_vs_Sample 3', 'Sample 2_vs_Sample 4', 'Sample 3_vs_Sample 4']\nGenes Detected: 5\nGenes differential expressed: 3\nIsoform Detected: 28\nIsoform differential expressed: 5\n" - print(print_test.__str__(),"\n",printable) +# 
print(print_test.__str__(),"\n",printable) self.assertTrue(print_test.__str__()==printable) del print_test @@ -693,7 +693,7 @@ def image_check(): im2=Image.open('Test_files/Papillon/Plot.png') hash1 = imagehash.average_hash(im1) hash2 = imagehash.average_hash(im2) - print(hash1,hash2) +# print(hash1,hash2) self.assertEqual(hash1,hash2) sub=test.get_gene() @@ -734,7 +734,7 @@ def image_check(): im2=Image.open('Test_files/Papillon/small-heatmap.png') hash1 = imagehash.average_hash(im1) hash2 = imagehash.average_hash(im2) - print(hash1,hash2) +# print(hash1,hash2) self.assertEqual(hash1,hash2) sub=test.get_gene() @@ -756,6 +756,105 @@ def image_check(): heatmap_maker(None,"isoform") sub.heatmap(z_score=False,export=True) image_check() + + def test_print(self): + test=pp.read_folder(path) + sub=test.get_gene() + printable2="Type of selection: gene\nNumber of gene selected: 3\nSamples: ['Sample 1', 'Sample 2', 'Sample 3', 'Sample 4']\nComparison selected: ['Sample 1_vs_Sample 2', 'Sample 1_vs_Sample 3', 'Sample 1_vs_Sample 4', 'Sample 2_vs_Sample 3', 'Sample 2_vs_Sample 4', 'Sample 3_vs_Sample 4']\n" + self.assertTrue(sub.__str__()==printable2) + + def test_add(self): + test=pp.read_folder(path) + # test isoform + sub1=test.get_isoform("IL6") + sub2=test.get_isoform("CD44") + sub=sub1+sub2 + self.assertEqual(len(sub.df),3) + self.assertEqual(len(sub.df.columns),len(sub1.df.columns)) + sub3=test.get_isoform("CCL15") + sub=sum([sub1,sub2,sub3]) + self.assertEqual(len(sub.df),5) + self.assertEqual(len(sub.df.columns),len(sub1.df.columns)) + sub4=test.get_isoform("IL6") + sub=sub+sub4 + self.assertEqual(len(sub.df),5) + self.assertEqual(len(sub.df.columns),len(sub1.df.columns)) + + # test genes + sub_g1=test.get_gene("IL6") + sub_g2=test.get_gene("IL17RC") + sub=sub_g1+sub_g2 + self.assertEqual(len(sub.df),2) + self.assertEqual(len(sub.df.columns),len(sub_g1.df.columns)) + sub_g3=test.get_gene("CCL15") + sub=sum([sub_g1,sub_g2,sub_g3]) + self.assertEqual(len(sub.df),3) + 
self.assertEqual(len(sub.df.columns),len(sub_g1.df.columns)) + sub_g4=test.get_gene("IL17RC") + sub=sub+sub_g4 + self.assertEqual(len(sub.df),3) + self.assertEqual(len(sub.df.columns),len(sub1.df.columns)) + + with self.assertRaises(Exception): + sub=sub1+sub_g2 + with self.assertRaises(Exception): + sub_g1=test.get_gene("IL6") + sub_g2=test.get_gene(comparison="Sample 3_vs_Sample 4") + sub=sub_g1+sub_g2 + + def test_list_search(self): + test=pp.read_folder(path) + sub=test.get_isoform() + sub_search=sub.search("sfd") + self.assertEqual(len(sub_search.df),0) + sub_search=sub.search("00") + self.assertEqual(len(sub_search.df),3) + self.assertEqual(len(sub.df.columns),len(sub_search.df.columns)) + + sub=test.get_gene() + sub_search=sub.search("sfd") + self.assertEqual(len(sub_search.df),0) + sub_search=sub.search("il") + self.assertEqual(len(sub_search.df),2) + self.assertEqual(len(sub.df.columns),len(sub_search.df.columns)) + + def test_sub_select(self): + test=pp.read_folder(path) + sub=test.get_isoform() + a=sub.select("IL6") + self.assertEqual(len(a.df),1) + self.assertEqual(len(a.df.columns),len(sub.df.columns)) + a=sub.select(["IL6"]) + self.assertEqual(len(a.df),1) + self.assertEqual(len(a.df.columns),len(sub.df.columns)) + a=sub.select(["IL6","wrong"]) + self.assertEqual(len(a.df),1) + self.assertEqual(len(a.df.columns),len(sub.df.columns)) + a=sub.select(["IL6","CCL15"]) + self.assertEqual(len(a.df),3) + self.assertEqual(len(a.df.columns),len(sub.df.columns)) + b=test.get_isoform(["IL6","CCL15"]) + a=sub.select(b) + self.assertEqual(len(a.df),3) + self.assertEqual(len(a.df.columns),len(sub.df.columns)) + + sub=test.get_gene() + a=sub.select("IL6") + self.assertEqual(len(a.df),1) + self.assertEqual(len(a.df.columns),len(sub.df.columns)) + a=sub.select(["IL6"]) + self.assertEqual(len(a.df),1) + self.assertEqual(len(a.df.columns),len(sub.df.columns)) + a=sub.select(["IL6","wrong"]) + self.assertEqual(len(a.df),1) + 
self.assertEqual(len(a.df.columns),len(sub.df.columns)) + a=sub.select(["IL6","CCL15"]) + self.assertEqual(len(a.df),2) + self.assertEqual(len(a.df.columns),len(sub.df.columns)) + b=test.get_isoform(["IL6","CCL15"]) + a=sub.select(b) + self.assertEqual(len(a.df),2) + self.assertEqual(len(a.df.columns),len(sub.df.columns)) if __name__ == '__main__': unittest.main() From ba3ca2600965b739436df04b207b2d27355446ad Mon Sep 17 00:00:00 2001 From: "domenico.somma" Date: Mon, 24 Sep 2018 15:26:05 +0100 Subject: [PATCH 5/8] v 0.2.0 Manual update --- docs/manual.md | 414 +++++++++++++++++++++++++++++++++++++++++++++ docs/manual.rst | 440 ------------------------------------------------ 2 files changed, 414 insertions(+), 440 deletions(-) create mode 100644 docs/manual.md delete mode 100644 docs/manual.rst diff --git a/docs/manual.md b/docs/manual.md new file mode 100644 index 0000000..28e2e9f --- /dev/null +++ b/docs/manual.md @@ -0,0 +1,414 @@ +## Installation +You can install Papillon using Pypi: + +`pip install papillon` + +or Anaconda through conda-forge either installing only papillon: + +`conda install -c conda-forge papillon` + +or adding the the conda-forge channel and then installing it: + +`conda config --add channels conda-forge` + +`conda install papillon` + +It should work with any IDE for data science (I tested jupyter-notebook and spyder). + +## Before To start + +After RNA-seq analysis with Galaxy, + +download the 4 files generated by cuffdiff containing respectively: + +_... transcript_FPKM_tracking..._ + +_... gene_FPKM_tracking ..._ + +_... gene_differential_expression..._ + +_... transcript_differential_expression..._ + +And put them in the same folder. + +You can either use directly the files or change the names according to + +cummeRbund in: + +_... transcript_FPKM_tracking... = isoforms.fpkm_tracking_ + +_... gene_FPKM_tracking ... = gene.fpkm_tracking_ + +_... gene_differential_expression... = gene_exp.diff_ + +_... 
transcript_differential_expression... = isoform_exp.diff_ + +Functions +--------- + +**read\_files(files, path=None, drop\_comparison=None)** + +_Accept cuffdiff/cummeRbund files as iterable ("transcript\_FPKM\_tracking", "gene\_FPKM\_tracking", "gene\_differential\_expression", "transcript\_differential\_expression") and return them to \_papillon\_builder() to create a Papillon\_db object._ + + Parameters + + files - accept an iterable with the cuffdiff files names + + path - where to export Papillon generated files + + drop\_comparison - drop comparison (str) or list of comparisons to drop +from the cuffdiff table + + Example + +pp.read\_files(["Files/gene\_exp.diff","Files/genes.fpkm\_tracking","Files/isoform\_exp.diff","Files/isoforms.fpkm\_tracking"]) + + +*** + + +**read\_folder(path, drop\_comparison=None)** + +_Read the folder containing the cuffdiff/cummeRbund files, and return them to \_papillon\_builder() to create a Papillon\_db object._ + + Parameters + + path - accept a str with the folder path, containing the cuffdiff files + + drop\_comparison - drop comparison (str) or list of comparisons to drop +from the cuffdiff table + + Example + + MyProject=pp.read\_folder("MyFolder/Test\_files") + +*** + +**read\_db(path, drop\_comparison=None)** + +_Deprecated. 
Use read\_folder() instead._ + + +*** + + +Classes +------- + +### class Papillon\_db(builtins.object) + + _Create a Papillon\_db object using read\_folder() or read\_files() and_ +_\_papillon\_builder_ + +#### Methods defined here: + +**\_\_init\_\_(self, path, samples, comparisons, genes\_detected,** +**isoforms\_detected)** + +_self.path - files path_ + +_self.samples - samples found_ + +_self.comparisons - comparisons found_ + +_self.genes\_detected - dataframe of genes detected_ + +_self.isoforms\_detected - dataframe of isoforms detected_ + +**\_\_str\_\_(self)** + +_Return str(self)._ + +*** + +**change\_order(self, new\_order)** + +_Change the samples order_ + + Parameters + + new\_order: list of samples order + + Example + + MyProject.change\_order(["Sample 4","Sample 3","Sample 2","Sample 1"]) + + +*** + + +**drop\_comparison(self, comparison)** + +_Drop Comparison (str) or list of comparisons_ + + Parameters + + comparison: comparison (str) or list of comparisons + + Example + + MyProject.drop\_comparison(specific\_comparison) + +*** + +**get\_gene(self, genelist=None, comparison='all', comparison\_sign=None,** +**fold\_ind=None, fold\_sign='>')** + +_This method select genes per name or conditions. It return a_ +_Papillon\_list object_ + + Parameters + +genelist - accept string (1 gene name), list of gene names or file with a list of gene names + +comparison - To select genes higher/lower in one condition compared to another. 
Accept either "all" to pass all the comparisons, or accept only 1 comparison as str (already present in the data) + +comparison\_sign - usable in combination with comparison, accept either ">" or "<" + +fold\_ind - fold induction (log2) higher/lower then number + +fold\_sign - usable in combination with fold\_ind, accept either ">" or "<" + + Example + +Selection=MyProject.get\_gene(["CD44","CCL15"], comparison="Sample 1\_vs\_Sample 2", comparison\_sign="<", fold\_ind=1, fold\_sign="<") + +*** + +**get\_isoform(self, genelist=None, comparison='all', comparison\_sign=None, fold\_ind=None, fold\_sign='>')** + +_This function select isoforms. It creates a Papillon object_ + + Parameters + +genelist - accept string (1 gene name), list of gene names or file with +a list of gene names + +comparison - To select genes higher/lower in one condition compared to +another. Accept either "all" to pass all the comparisons, or accept only +1 comparison as str (already present in the data) + + comparison\_sign - usable in combination with comparison, accept either +">" or "<" + + fold\_ind - fold induction (log2) higher/lower then number + + fold\_sign - usable in combination with fold\_ind, accept either ">" or +"<" + + Example + + Selection=MyProject.get\_isoform(["IL6","CCL15","IL17RC"], +comparison="Sample 2\_vs\_Sample 4", comparison\_sign="<") + +*** + +**search(self, word, where, how='table', export=False)** + + _search among genes/isoforms names in detected and significant_ + + Parameters + + word - accept a str to search among the gene names + + where - accept: + + "genes\_detected" + + "genes\_significant" + + "isoforms\_detected" + + "isoforms\_significant" + + how - accept: + + "table" return the dataframe with the genes found + + "list" return a list of names, no duplicates + + export - True/False + + Example + + search\_result=MyProject.search(word="IL6",where="genes\_significant", +how="table") + +*** + +### class Papillon\_list(builtins.object) + +_Class containing 
a selected list of genes, with data associated,_ +_generated by Papillon\_db.get\_gene() or Papillon\_db.get\_isoform()_ +_methods_ + +_Is possible add the content of two or more Papillon\_list objects._ + +_Example:_ + +_P\_list3 = P\_list1+P\_list2_ + +_P\_list4=sum([PL1, PL2, PL3])_ + +#### Methods defined here: + +\_\_add\_\_(self, other) + +\_\_init\_\_(self, df, what, comparisons, path, samples, +comparison='all', comparison\_sign=None, fold\_ind=None, fold\_sign='>', +p=0.05) + + Initialize self. See help(type(self)) for accurate signature. + +\_\_radd\_\_(self, other) + +\_\_str\_\_(self) + + Return str(self). + +*** + +**compare(self, other)** + +_Compare two Papillon\_list objects_ + + Parameters + + other: another Papillon\_list object + + Example + + Selection1.compare(Selection2) + +*** + +**export(self)** + +_Export the selected genes/isoforms as excel file._ + + Example + + Selection.export() + +*** + +**heatmap(self, z\_score=False, col\_cluster=False, method='complete',** +**cmap='seismic', export=False, \*\*options)** + +_Generate heatmap using selected genes/isoforms_ + + Parameters + + z\_score - True/False whether want or not apply z-score normalization + + col\_cluster - True/False whether want or not cluster the samples + + method - clustering algorithm - default is complete-linkage + + cmap - map color + + export - True/False whether want or not export the dataframe of + + selected genes + + \*\*options - all the options accepted by seaborn.clustermap + + default metric is euclidean. + + Example + + Selection.heatmap(z\_score=True,export=True) + +*** + +**lineplot(self, title='', legend=True, z\_score=False, export=False,** +**size=10, ci=None, \*\*option)** + +_LinePlot selected genes expression levels. 
Max number of genes 200_ + + Parameters + + title - accept a str as title of the plot + + legend - True/False show the legend + + z\_score - True/False calculate the z-score normalization + + export - True/False whether or not export the image + + size - change the size of the plot + + \*\*options - all the options accepted by seaborn.factorplot + + Example + + Selection.lineplot(export=True,z\_score=True) + +*** + +**onlyFPKM(self, return\_as, remove\_FPKM\_name=False)** + + _Take a Papillon\_list object and return only FPKM columns._ + + Parameters + + return as: + + "df" - pandas DataFrame + + "array" - numpy array + + "gene name" - pandas DataFrame containing gene names + + remove\_FPKM\_name: True/False + + Example + + df=Selection.onlyFPKM("df") + +*** + +**plot(\*\*parameter)** + +_Deprecated. Use self.lineplot() instead_ + +*** + +**search(self, string)** + +_Search a string in the Papillon\_list_ + + Parameters + + string - accept a str to search among the gene names + + Example + + search\_results=Selection.search("IL") + +*** + +**select(self, genelist)** + +_Create another Papillon\_list object_ + + Parameters + + genelist: accept string (1 gene name), list of gene names or file + + with a list of gene names + + Example + + Selection2=Selection1.select(["IL6","CCL15"]) + +*** + +**show(self)** + +_Show genes/isoforms as Dataframe_ + + Example + + Selection.show() + diff --git a/docs/manual.rst b/docs/manual.rst deleted file mode 100644 index eaddbed..0000000 --- a/docs/manual.rst +++ /dev/null @@ -1,440 +0,0 @@ -Papillon -======== - -A python version of CummeRbund to read and plot Galaxy/cuffdiff RNA-seq -data - -Before To start ---------------- - -After RNA-seq analysis with Galaxy, - -download the 4 files generated by cuffdiff containing respectively: - -... transcript\_FPKM\_tracking... - -... gene\_FPKM\_tracking ... - -... gene\_differential\_expression... - -... transcript\_differential\_expression... - -And put them in the same folder. 
- -You can either use directly the files or change the names according to - -cummeRbund in: - -... transcript\\\_FPKM\\\_tracking... = isoforms.fpkm\_tracking - -... gene\\\_FPKM\\\_tracking ... = gene.fpkm\_tracking - -... gene\\\_differential\\\_expression... = gene\_exp.diff - -... transcript\\\_differential\\\_expression... = isoform\_exp.diff - -Functions ---------- - -read\_db(path, drop\_comparison=None) - - Deprecated. Use read\_folder() instead. - -read\_files(files, path=None, drop\_comparison=None) - - Accept cuffdiff/cummeRbund files as iterable - - ("transcript\_FPKM\_tracking", - - "gene\_FPKM\_tracking", - - "gene\_differential\_expression", - - "transcript\_differential\_expression") - - and return them to \_papillon\_builder() to create a Papillon\_db -object. - - Parameters - - ---------- - - files - accept an iterable with the cuffdiff files names - - path - where to export Papillon generated files - - drop\_comparison - drop comparison (str) or list of comparisons to drop -from the cuffdiff table - - Example - - ---------- - - -pp.read\_files(["Files/gene\_exp.diff","Files/genes.fpkm\_tracking","Files/isoform\_exp.diff","Files/isoforms.fpkm\_tracking"]) - -read\_folder(path, drop\_comparison=None) - - Read the folder containing the cuffdiff/cummeRbund files, and return - - them to \_papillon\_builder() to create a Papillon\_db object. 
- - Parameters - - ---------- - - path - accept a str with the folder path, containing the cuffdiff files - - drop\_comparison - drop comparison (str) or list of comparisons to drop -from the cuffdiff table - - Example - - ---------- - - MyProject=pp.read\_folder("MyFolder/Test\_files") - -Classes -------- - -class Papillon\_db(builtins.object) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - Create a Papillon\_db object using read\_folder() or read\_files() and -\_papillon\_builder - - Methods defined here: - -\_\_init\_\_(self, path, samples, comparisons, genes\_detected, -isoforms\_detected) - - self.path - files path - - self.samples - samples found - - self.comparisons - comparisons found - - self.genes\_detected - dataframe of genes detected - - self.isoforms\_detected - dataframe of isoforms detected - -\_\_str\_\_(self) - - Return str(self). - -change\_order(self, new\_order) - - Change the samples order - - Parameters - - ---------- - - new\_order: list of samples order - - Example - - ---------- - - MyProject.change\_order(["Sample 4","Sample 3","Sample 2","Sample 1"]) - -drop\_comparison(self, comparison) - - Drop Comparison (str) or list of comparisons - - Parameters - - ---------- - - comparison: comparison (str) or list of comparisons - - Example - - ---------- - - MyProject.drop\_comparison(specific\_comparison) - -get\_gene(self, genelist=None, comparison='all', comparison\_sign=None, -fold\_ind=None, fold\_sign='>') - - This method select genes per name or conditions. It return a -Papillon\_list object - - Parameters - - ---------- - - genelist - accept string (1 gene name), list of gene names or file with -a list of gene names - - comparison - To select genes higher/lower in one condition compared to -another. 
Accept either "all" to pass all the comparisons, or accept only -1 comparison as str (already present in the data) - - comparison\_sign - usable in combination with comparison, accept either -">" or "<" - - fold\_ind - fold induction (log2) higher/lower then number - - fold\_sign - usable in combination with fold\_ind, accept either ">" or -"<" - - Example - - ---------- - - Selection=MyProject.get\_gene(["CD44","CCL15"], comparison="Sample -1\_vs\_Sample 2", comparison\_sign="<", fold\_ind=1, fold\_sign="<") - -get\_isoform(self, genelist=None, comparison='all', -comparison\_sign=None, fold\_ind=None, fold\_sign='>') - - This function select isoforms. It creates a Papillon object - - Parameters - - ---------- - - genelist - accept string (1 gene name), list of gene names or file with -a list of gene names - - comparison - To select genes higher/lower in one condition compared to -another. Accept either "all" to pass all the comparisons, or accept only -1 comparison as str (already present in the data) - - comparison\_sign - usable in combination with comparison, accept either -">" or "<" - - fold\_ind - fold induction (log2) higher/lower then number - - fold\_sign - usable in combination with fold\_ind, accept either ">" or -"<" - - Example - - ---------- - - Selection=MyProject.get\_isoform(["IL6","CCL15","IL17RC"], -comparison="Sample 2\_vs\_Sample 4", comparison\_sign="<") - -search(self, word, where, how='table', export=False) - - search among genes/isoforms names in detected and significant - - Parameters - - ---------- - - word - accept a str to search among the gene names - - where - accept: - - "genes\_detected" - - "genes\_significant" - - "isoforms\_detected" - - "isoforms\_significant" - - how - accept: - - "table" return the dataframe with the genes found - - "list" return a list of names, no duplicates - - export - True/False - - Example - - ---------- - - search\_result=MyProject.search(word="IL6",where="genes\_significant", -how="table") - -class 
Papillon\_list(builtins.object) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - Class containing a selected list of genes, with data associated, -generated by Papillon\_db.get\_gene() or Papillon\_db.get\_isoform() -methods - -Is possible add the content of two or more Papillon\_list objects. - -Example: - -P\_list3 = P\_list1+P\_list2 - -P\_list4=sum([PL1, PL2, PL3]) - - Methods defined here: - -\_\_add\_\_(self, other) - -\_\_init\_\_(self, df, what, comparisons, path, samples, -comparison='all', comparison\_sign=None, fold\_ind=None, fold\_sign='>', -p=0.05) - - Initialize self. See help(type(self)) for accurate signature. - -\_\_radd\_\_(self, other) - -\_\_str\_\_(self) - - Return str(self). - -compare(self, other) - - Compare two Papillon\_list objects - - Parameters - - ---------- - - other: another Papillon\_list object - - Example - - ---------- - - Selection1.compare(Selection2) - -export(self) - - Export the selected genes/isoforms as excel file. - - Example - - ---------- - - Selection.export() - -heatmap(self, z\_score=False, col\_cluster=False, method='complete', -cmap='seismic', export=False, \*\*options) - - Generate heatmap using selected genes/isoforms - - Parameters - - ---------- - - z\_score - True/False whether want or not apply z-score normalization - - col\_cluster - True/False whether want or not cluster the samples - - method - clustering algorithm - default is complete-linkage - - cmap - map color - - export - True/False whether want or not export the dataframe of - - selected genes - - \*\*options - all the options accepted by seaborn.clustermap - - default metric is euclidean. - - Example - - ---------- - - Selection.heatmap(z\_score=True,export=True) - -lineplot(self, title='', legend=True, z\_score=False, export=False, -size=10, ci=None, \*\*option) - - LinePlot selected genes expression levels. 
Max number of genes 200 - - Parameters - - ---------- - - title - accept a str as title of the plot - - legend - True/False show the legend - - z\_score - True/False calculate the z-score normalization - - export - True/False whether or not export the image - - size - change the size of the plot - - \*\*options - all the options accepted by seaborn.factorplot - - Example - - ---------- - - Selection.lineplot(export=True,z\_score=True) - -onlyFPKM(self, return\_as, remove\_FPKM\_name=False) - - Take a Papillon\_list object and return only FPKM columns. - - Parameters - - ---------- - - return as: - - "df" - pandas DataFrame - - "array" - numpy array - - "gene name" - pandas DataFrame containing gene names - - remove\_FPKM\_name: True/False - - Example - - ---------- - - df=Selection.onlyFPKM("df") - -plot(\*\*parameter) - - Deprecated. Use self.lineplot() instead - -search(self, string) - - Search a string in the Papillon\_list - - Parameters - - ---------- - - string - accept a str to search among the gene names - - Example - - ---------- - - search\_results=Selection.search("IL") - -select(self, genelist) - - Create another Papillon\_list object - - Parameters - - ---------- - - genelist: accept string (1 gene name), list of gene names or file - - with a list of gene names - - Example - - ---------- - - Selection2=Selection1.select(["IL6","CCL15"]) - -show(self) - - Show genes/isoforms as Dataframe - - Example - - ---------- - - Selection.show() From f66f943cb97b0ba49478749a54eb2aeed03cb055 Mon Sep 17 00:00:00 2001 From: "domenico.somma" Date: Mon, 24 Sep 2018 15:27:56 +0100 Subject: [PATCH 6/8] v 0.2.0 --- CHANGE.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGE.txt b/CHANGE.txt index 3f93eae..6269fed 100644 --- a/CHANGE.txt +++ b/CHANGE.txt @@ -1,4 +1,4 @@ -v 0.2.0, -18 -- Major changes: +v 0.2.0, 24-9-18 -- Major changes: Now seaborn 0.23 required Now you can keep the genes/isoforms subselection in a variable Add or compare two 
subselections From 68f752800832d4d8b74a55db1ff8d730e75fe097 Mon Sep 17 00:00:00 2001 From: "domenico.somma" Date: Mon, 24 Sep 2018 15:29:21 +0100 Subject: [PATCH 7/8] v 0.2.0 --- CHANGE.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGE.txt b/CHANGE.txt index 6269fed..64868e7 100644 --- a/CHANGE.txt +++ b/CHANGE.txt @@ -1,6 +1,6 @@ v 0.2.0, 24-9-18 -- Major changes: -Now seaborn 0.23 required -Now you can keep the genes/isoforms subselection in a variable +Now Pandas 0.23 required +Now you can keep the genes/isoforms selection in a variable Add or compare two subselections You can select either gene/isoform significant expressed for at least one condition or not significant at all. Plot gene/isoform significant for at least one condition or not significant with continuous and dashed line. From ddac073959511b72e6bb1943263b2ae4909ac711 Mon Sep 17 00:00:00 2001 From: Domenico <34346930+domenico-somma@users.noreply.github.com> Date: Mon, 24 Sep 2018 15:57:22 +0100 Subject: [PATCH 8/8] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5357f01..96afc5b 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ url='https://github.com/domenico-somma/Papillon/', python_requires='>=3.3, <4', install_requires=[ - "pandas >= 0.17.1", + "pandas >= 0.23", "Seaborn >= 0.8.1", ], )