-
Notifications
You must be signed in to change notification settings - Fork 2
/
01_extract_wells.py
89 lines (76 loc) · 3.22 KB
/
01_extract_wells.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/python
# Authors: L. Penter, L. Hansmann
# Hematology, Oncology, and Tumor Immunology, Charité - Universitätsmedizin Berlin
# process fasta file containing all reads and extract sequences according to originating well
# output is a directory with fasta files containing T cell receptor (TCR) or B cell receptor (BCR)
# reads sorted by well
#
# process reads: 01_extract_wells.py <fasta file>
#
# the fasta file can be generated for example with PEAR
#
# 2018-01-09: minor fixes in parsing of fasta file
import subprocess
import re
import os
import sys
import argparse
#command line arguments
parser = argparse.ArgumentParser(description='Analyze TCR and BCR reads and extract')
parser.add_argument('inputfile', type=str, help='Filename of fasta input file')
parser.add_argument('-output', type=str, help='output directory')
args = parser.parse_args()
sequences=args.inputfile
# create output directorires if necessary
if args.output:
os.mkdir(args.output)
os.mkdir(args.output+'/TCR/')
# plate, row, and column barcodes in numerical/alphabetical order
TCR_plate_barcodes =[ 'GCAGA', 'TCGAA', 'AACAA', 'GGTGC', 'TTGGT', 'CATTC', 'ATTGG', 'CGGTT', 'ATCCT', 'ATGTC', 'TCACG', 'AGACC', 'CCCCA', 'GCGCT', 'TCCTT', 'TATAT', 'CGTAA', 'AAGGT', 'AGCTC', 'CCTGC', 'GTATC', 'TATGA', 'CACAC', 'ACACT', 'ACTAC', 'GTTAC']
TCR_row_barcodes = [ 'TAAGC', 'TGCAC', 'CTCAG', 'GGAAT', 'CGAGG', 'AGGAG', 'TGTTG', 'CAACT']
TCR_column_barcodes = ['TGAAC', 'TCCTG', 'TATAA', 'ACAGG', 'GCGGT', 'TAAGT', 'CTAGC', 'ACGTC', 'TAGCC', 'CATTC', 'GTTGG', 'GTCTC']
revcompl = lambda x: ''.join([{'A':'T','C':'G','G':'C','T':'A', 'N':'N'}[B] for B in x][::-1])
t_ass_seqs={}
b_ass_seqs={}
unassigned=0
assigned=0
read_number=0
tcr_count=0
bcr_count=0
# open fasta file
seq=open(sequences, mode='rU')
DNA=set('ATGC\n')
# parse fasta file line by line
for line in seq:
if set(line) <= DNA:
read_number+=1
# print out statistics of progress
print '\rAssigned', assigned, '/', read_number, 'TCR: ', tcr_count, 'BCR: ', bcr_count,
line=line.replace('\n','')
# T cell receptor read found
if line[2:7] in TCR_plate_barcodes and line[9:14] in TCR_row_barcodes and line[len(line)-7:-2] in TCR_column_barcodes and \
not 'N' in line and not 'AAAAAAAAAAAA' in line and not 'GGGGGGGGGGGG' in line and not 'CCCCCCCCCCCC' in line and not 'TTTTTTTTTTTT' in line:
location=str(TCR_plate_barcodes.index(line[2:7])+1) + str(chr(ord('A')+TCR_row_barcodes.index(line[9:14]))) + str(TCR_column_barcodes.index(line[len(line)-7:-2])+1)
if location in t_ass_seqs:
t_ass_seqs[location].append(revcompl(line[14:].replace('\n','')))
else:
t_ass_seqs[location]=[revcompl(line[14:].replace('\n', ''))]
tcr_count+=1
assigned+=1
# sequence not assigned
else:
unassigned+=1
# statistics of assigned and unassigned sequences
print 'Unassigned: ', unassigned, ' Assigned: ', assigned
seq.close()
# write sequences to files
# T cell receptor reads
for key in t_ass_seqs.keys():
counter=1
outfile=open(args.output+'/TCR/' + key + '.fasta', mode='w')
for line in t_ass_seqs[key]:
outfile.write('>' + str(counter) + '\n')
outfile.write(line + '\n')
counter=counter+1
outfile.close()
print('well files written')