Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 90 additions & 33 deletions proteome_analysis/translate_trsk_genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,17 @@
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Alphabet import SingleLetterAlphabet
from gfftools import GFFParser, helper
from signal_labels import generate_genome_seq_labels as seqlab
import os
import re


def translate_trsk_genes(gtf_file, fas_file, out_seq_fname):
"""
translate the trsk genes to protein sequence

def translate_trsk_genes(gtf_file, fas_file, out_seq_fname, max_orf_opt = False):
""" translate the trsk genes to protein sequence
@args gtf_file: genome annotation file
@type gtf_file: str
@type gtf_file: str
@args fas_file: genome sequence file
@type fas_file: str
@args out_seq_fname: output file in fasta format
Expand All @@ -37,36 +38,46 @@ def translate_trsk_genes(gtf_file, fas_file, out_seq_fname):
## genome sequence file reading
sys.stdout.write('reading genome sequence from %s\n' % fas_file)
seqlab.chrom_name_consistency(fas_file, anno_db)

cds_idx = [] # deleting the empty cds lines
for idp, feat in enumerate(anno_db):
if not feat['cds_exons'][0].any(): # TSkim annotation expects only single transcript from a region
cds_idx.append(idp)
anno_db = np.delete(anno_db, cds_idx)
genes_with_cds = len(anno_db)

fasFH = helper.open_file(fas_file)
out_seq_fh = open(out_seq_fname, "w")
for rec in SeqIO.parse(fasFH, "fasta"):
cds_idx = []
if not max_orf_opt:
for idp, feat in enumerate(anno_db):
if not feat['cds_exons'][0].any():
cds_idx.append(idp)
else:
for idp, feat in enumerate(anno_db):
if not feat['exons'][0].any():
cds_idx.append(idp)
anno_db = np.delete(anno_db, cds_idx)
genes_with_cds = 0

fasFH = helper.open_file(fas_file)
out_seq_fh = open(out_seq_fname, 'w')
for rec in SeqIO.parse(fasFH, 'fasta'):
for idx, feature in enumerate(anno_db):
if rec.id == feature['chr']:
## iterate over cds_exons
cds_seq = ''
for ex in feature['cds_exons'][0]:## single transcript by TSkim
cds_seq += rec.seq[ex[0]-1:ex[1]]

if feature['strand'] == '-':
cds_seq = cds_seq.reverse_complement()
##
#sys.stdout.write(str(cds_seq.translate()) + "\n")

## fasta output
if cds_seq:
prt_seq = SeqRecord(cds_seq.translate(), id=feature['name'], description='protein sequence')
out_seq_fh.write(prt_seq.format("fasta"))

# FIXME need an efficient way to translate multiple gene
# iterate over chromosome
if len(feature['cds_exons'][0]) == 0:
max_orf_opt = True
if not max_orf_opt:
for ex in feature['cds_exons'][0]:
cds_seq += rec.seq[ex[0] - 1:ex[1]]

if feature['strand'] == '-':
cds_seq = cds_seq.reverse_complement()
if cds_seq:
prt_seq = SeqRecord(cds_seq.translate(), id=feature['name'], description='protein sequence')
out_seq_fh.write(prt_seq.format('fasta'))
else:
start = int(feature['exons'][0][0][0])
end = int(feature['exons'][0][0][1] + 1)
cds_seq += rec.seq[start:end]
if feature['strand'] == '-':
cds_seq = cds_seq.reverse_complement()
cds_seq = find_max_orf(cds_seq)
if cds_seq:
genes_with_cds += 1
prt_seq = SeqRecord(cds_seq.translate(), id=feature['name'], description='protein sequence')
out_seq_fh.write(prt_seq.format('fasta'))

fasFH.close()
out_seq_fh.close()
Expand Down Expand Up @@ -126,6 +137,52 @@ def trsk_gene_len_dist(gtf_file, out_file="hist_cds_len.pdf"):
df_cds_len_bin.plot(kind="bar")
plt.savefig(out_file)

def find_max_orf(seq):
""" find optimal start and end for concatenated exons
@args seq: biopython alphabet object

Returns:
False bool Longest ORF was less than 100 nt
Seq BioPython Seq Object Longest start to end substring in seq
"""
seq = str(seq)
start_arr = np.array([ m.start() for m in re.finditer('ATG', seq) ])
taa_list = np.array([ m.start() for m in re.finditer('TAA', seq) ])
tag_list = np.array([ m.start() for m in re.finditer('TAG', seq) ])
tga_list = np.array([ m.start() for m in re.finditer('TGA', seq) ])
end_arr = np.sort(np.hstack((taa_list, tag_list, tga_list)))
if len(start_arr) == 0 or len(end_arr) == 0:
return False
max_dist = 0
max_start = -1
max_end = -1
start_arr_orf = [ int(x % 3) for x in start_arr ]
end_arr_orf = [ int(x % 3) for x in end_arr ]
for frame_ind in [0, 1, 2]:
start_mask = np.array([ val == frame_ind for val in start_arr_orf ])
start_frame = start_arr[start_mask]
end_mask = np.array([ val == frame_ind for val in end_arr_orf ])
end_frame = end_arr[end_mask]
if len(start_frame) != 0 and len(end_frame) != 0:
for start in np.sort(start_frame):
seen_end = False
for end in np.sort(end_frame):
if end < start or seen_end:
next
else:
seen_end = True
dist = end - start
if dist > max_end:
max_start = start
max_end = end
max_dist = dist
next
next

if max_dist < 100:
return False
else:
return Seq(seq[int(max_start):int(max_end) + 3], SingleLetterAlphabet())

if __name__=="__main__":
gname = "hs_chr22.gff"
Expand Down