version 3.5

MatthiasLienhard · Sep 13, 2023 · deed45e · deed45e
1 parent 94ebb9f
commit deed45e
Show file tree

Hide file tree

Showing 13 changed files with 426 additions and 400 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,9 @@
 * planned new feature: during import of long reads, (optionally) correct for short exon alignment issues. 
 * separate new read import and classification of isoforms.
 
+## [0.3.5]
+* fixed a bug in domain plots, which was introduced in 0.3.4
+
 ## [0.3.4]
 * fixing #8: AssertionError when unifying TSS/PAS between transcripts
 * improved domain plots: ORF start and end do not appear like exon exon boundaries. 

diff --git a/VERSION.txt b/VERSION.txt
@@ -1 +1 @@
-0.3.4
+0.3.5
diff --git a/docs/notebooks/01_prepare_data.ipynb b/docs/notebooks/01_prepare_data.ipynb
@@ -347,7 +347,7 @@
    "source": [
     "## Demonstration Data\n",
     "\n",
-    "To create an demonstration data set, we aligned the encode fastq files with minimap2, and sub-selected reads mapping to chromosome 8 only. All resulting files (~270 Mb) [can be downloaded here](https://oc-molgen.gnz.mpg.de/owncloud/s/gjG9EPiQwpRAyg3).\n",
+    "To create a demonstration data set, we aligned the encode fastq files with minimap2, and sub-selected reads mapping to chromosome 8 only. All resulting files (~270 Mb) [can be downloaded here](https://nc.molgen.mpg.de/cloud/index.php/s/zYe7g6qnyxGDxRd).\n",
     "``` bash\n",
     "\n",
     "\n",

diff --git a/docs/notebooks/03_transcriptome_reconstruction.ipynb b/docs/notebooks/03_transcriptome_reconstruction.ipynb
diff --git a/docs/notebooks/03b_transcriptome_import.ipynb b/docs/notebooks/03b_transcriptome_import.ipynb
@@ -41,7 +41,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "INFO:This is isootools version 0.3.3\n"
+      "INFO:This is isootools version 0.3.5\n"
      ]
     }
    ],
@@ -70,11 +70,11 @@
      "output_type": "stream",
      "text": [
       "INFO:importing reference from gff3 file demonstration_dataset/gencode.v42.chr_patch_hapl_scaff.annotation_sorted_chr8.gff3.gz\n",
-      "100%|█████████▉| 2.82M/2.82M [00:02<00:00, 1.36MB/s]\n",
-      "INFO:skipped the following categories: {'CDS', 'three_prime_UTR', 'five_prime_UTR'}\n",
+      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 2.82M/2.82M [00:02<00:00, 1.22MB/s]\n",
+      "INFO:skipped the following categories: {'five_prime_UTR', 'CDS', 'three_prime_UTR'}\n",
       "WARNING:Missing genes! Found gene information in categories ['gene'] for 2540/5080 genes\n",
       "INFO:adding samples \"GM12878_pooled\", \"K562_pooled\" from csv\n",
-      "100%|██████████| 2611/2611 [00:06<00:00, 391.00genes/s]\n"
+      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 2611/2611 [00:09<00:00, 276.43genes/s]\n"
      ]
     },
     {
@@ -110,30 +110,30 @@
        "      <th>0</th>\n",
        "      <td>GM12878_pooled</td>\n",
        "      <td>demonstration_dataset/demonstration_dataset_su...</td>\n",
-       "      <td>NaN</td>\n",
+       "      <td>GM12878_pooled</td>\n",
        "      <td>125138</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>K562_pooled</td>\n",
        "      <td>demonstration_dataset/demonstration_dataset_su...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>125138</td>\n",
+       "      <td>K562_pooled</td>\n",
+       "      <td>142708</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "             name                                               file group  \\\n",
-       "0  GM12878_pooled  demonstration_dataset/demonstration_dataset_su...   NaN   \n",
-       "0     K562_pooled  demonstration_dataset/demonstration_dataset_su...   NaN   \n",
+       "             name                                               file  \\\n",
+       "0  GM12878_pooled  demonstration_dataset/demonstration_dataset_su...   \n",
+       "0     K562_pooled  demonstration_dataset/demonstration_dataset_su...   \n",
        "\n",
-       "  nonchimeric_reads chimeric_reads  \n",
-       "0            125138              0  \n",
-       "0            125138              0  "
+       "            group nonchimeric_reads chimeric_reads  \n",
+       "0  GM12878_pooled            125138              0  \n",
+       "0     K562_pooled            142708              0  "
       ]
      },
      "execution_count": 2,
@@ -147,7 +147,7 @@
     "isoseq=Transcriptome.from_reference(annotation_fn)\n",
     "\n",
     "#specify the columns with the read counts per transcript\n",
-    "read_count_cols={'GM12878_pooled':'GM12878_sum_coverage','K562_pooled':'GM12878_sum_coverage'}\n",
+    "read_count_cols={'GM12878_pooled':'GM12878_sum_coverage','K562_pooled':'K562_sum_coverage'}\n",
     "\n",
     "# add the transcripts from the gtf file\n",
     "id_map=isoseq.add_sample_from_csv(\n",
@@ -163,6 +163,13 @@
     "isoseq.sample_table"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,

diff --git a/docs/notebooks/04_saturation_analysis.ipynb b/docs/notebooks/04_saturation_analysis.ipynb
diff --git a/docs/notebooks/05_qc.ipynb b/docs/notebooks/05_qc.ipynb
diff --git a/docs/notebooks/06_filtering.ipynb b/docs/notebooks/06_filtering.ipynb
@@ -17,7 +17,7 @@
     "* define custom tags and filter expressions, to tailor filter queries.\n",
     "\n",
     "\n",
-    "This tutorial depends on the transcriptome file PacBio_isotools_substantial_isotools.pkl, which can be obtained with this [download link](https://nc.molgen.mpg.de/cloud/index.php/s/zYe7g6qnyxGDxRd)"
+    "This tutorial assumes you have run the tutorial on [transcriptome reconstruction](03_transcriptome_reconstruction.html) already, and prepared the transcriptome pkl file *\"PacBio_isotools.pkl\"* based on the [demonstration data set](https://nc.molgen.mpg.de/cloud/index.php/s/zYe7g6qnyxGDxRd)."
    ]
   },
   {
@@ -32,7 +32,7 @@
     "import pandas as pd\n",
     "\n",
     "path='demonstration_dataset'\n",
-    "isoseq=Transcriptome.load(f'{path}/PacBio_isotools_substantial_isotools.pkl')"
+    "isoseq=Transcriptome.load(f'{path}/PacBio_isotools.pkl')"
    ]
   },
   {
@@ -123,15 +123,14 @@
      "text": [
       "\n",
       "These are the infos for this transcript:\n",
-      "exons: [[89757792, 89758233], [89762828, 89762982], [89765340, 89765496], [89769771, 89769929], [89771740, ...\n",
+      "exons: [[89757791, 89758233], [89762828, 89762982], [89765340, 89765496], [89769771, 89769929], [89771740, ...\n",
       "strand: +\n",
       "coverage: {'GM12878_a': 48, 'GM12878_b': 135, 'GM12878_c': 138, 'K562_a': 19, 'K562_b': 18, 'K562_c': 13}\n",
       "TSS: {'GM12878_a': {89757763: 1, 89757766: 1, 89757778: 1, 89757782: 3, 89757783: 4, 89757784: 4, 8975778...\n",
       "PAS: {'GM12878_a': {89790462: 3, 89791057: 3, 89790608: 5, 89790939: 3, 89790606: 3, 89790425: 1, 8979046...\n",
       "annotation: (0, {'FSM': [1]})\n",
-      "TSS_unified: {'GM12878_a': {89757792: 48}, 'GM12878_b': {89757792: 135}, 'GM12878_c': {89757792: 138}, 'K562_a': ...\n",
-      "PAS_unified: {'GM12878_a': {89790463: 20, 89790983: 10, 89790606: 11, 89790940: 3, 89790525: 4}, 'GM12878_b': {89...\n",
-      "ORF: (89758060, 89790416, {'start': 268, 'length': 1623, 'start_codon': 'ATG', 'stop_codon': 'TAA', 'NMD'...\n",
+      "TSS_unified: {'GM12878_a': {89757791: 48}, 'GM12878_b': {89757791: 135}, 'GM12878_c': {89757791: 138}, 'K562_a': ...\n",
+      "PAS_unified: {'GM12878_a': {89790462: 20, 89790982: 10, 89790605: 11, 89790939: 3, 89790524: 4}, 'GM12878_b': {89...\n",
       "direct_repeat_len: [3, 3, 4, 5, 7, 3, 5, 4, 6, 4]\n",
       "downstream_A_content: 0.2\n"
      ]
@@ -411,7 +410,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 10803/10803 [00:00<00:00, 15873.69genes/s]\n"
+      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 10803/10803 [00:00<00:00, 17189.88genes/s]\n"
      ]
     }
    ],

diff --git a/docs/notebooks/08_alternative_splicing.ipynb b/docs/notebooks/08_alternative_splicing.ipynb