From 965d00e8e5dc6b52b13f16c9cc1baaf14c27b5b8 Mon Sep 17 00:00:00 2001 From: David Kuo Date: Mon, 14 Dec 2015 16:41:24 -0500 Subject: [PATCH 1/2] update with orf --- proteome_analysis/translate_trsk_genes.py | 124 ++++++++++++++++------ 1 file changed, 90 insertions(+), 34 deletions(-) diff --git a/proteome_analysis/translate_trsk_genes.py b/proteome_analysis/translate_trsk_genes.py index aacab15..c791745 100644 --- a/proteome_analysis/translate_trsk_genes.py +++ b/proteome_analysis/translate_trsk_genes.py @@ -10,16 +10,16 @@ import pandas as pd from Bio import SeqIO from Bio.SeqRecord import SeqRecord +from Bio.Seq import Seq +from Bio.Alphabet import SingleLetterAlphabet from gfftools import GFFParser, helper from signal_labels import generate_genome_seq_labels as seqlab +import pdb +import os +import re - -def translate_trsk_genes(gtf_file, fas_file, out_seq_fname): - """ - translate the trsk genes to protein sequence - - @args gtf_file: genome annotation file - @type gtf_file: str +def translate_trsk_genes(gtf_file, fas_file, out_seq_fname, max_orf_opt = False): + """ translate the trsk genes to protein sequence @args gtf_file: genome annotation file @type gtf_file: str @args fas_file: genome sequence file @type fas_file: str @args out_seq_fname: output file in fasta format @@ -37,36 +37,46 @@ def translate_trsk_genes(gtf_file, fas_file, out_seq_fname): ## genome sequence file reading sys.stdout.write('reading genome sequence from %s\n' % fas_file) seqlab.chrom_name_consistency(fas_file, anno_db) - - cds_idx = [] # deleting the empty cds lines - for idp, feat in enumerate(anno_db): - if not feat['cds_exons'][0].any(): # TSkim annotation expects only single transcript from a region - cds_idx.append(idp) - anno_db = np.delete(anno_db, cds_idx) - genes_with_cds = len(anno_db) - - fasFH = helper.open_file(fas_file) - out_seq_fh = open(out_seq_fname, "w") - for rec in SeqIO.parse(fasFH, "fasta"): + cds_idx = [] + if not max_orf_opt: + for idp, feat in enumerate(anno_db): + if not feat['cds_exons'][0].any(): + cds_idx.append(idp) + else: + for idp, feat in enumerate(anno_db): + if not feat['exons'][0].any(): + cds_idx.append(idp) + anno_db = np.delete(anno_db, cds_idx) + genes_with_cds = 0 + + fasFH = helper.open_file(fas_file) + out_seq_fh = open(out_seq_fname, 'w') + for rec in SeqIO.parse(fasFH, 'fasta'): for idx, feature in enumerate(anno_db): if rec.id == feature['chr']: - ## iterate over cds_exons cds_seq = '' - for ex in feature['cds_exons'][0]:## single transcript by TSkim - cds_seq += rec.seq[ex[0]-1:ex[1]] - - if feature['strand'] == '-': - cds_seq = cds_seq.reverse_complement() - ## - #sys.stdout.write(str(cds_seq.translate()) + "\n") - - ## fasta output - if cds_seq: - prt_seq = SeqRecord(cds_seq.translate(), id=feature['name'], description='protein sequence') - out_seq_fh.write(prt_seq.format("fasta")) - - # FIXME need an efficient way to translate multiple gene - # iterate over chromosome + if len(feature['cds_exons'][0]) == 0: + max_orf_opt = True + if not max_orf_opt: + for ex in feature['cds_exons'][0]: + cds_seq += rec.seq[ex[0] - 1:ex[1]] + + if feature['strand'] == '-': + cds_seq = cds_seq.reverse_complement() + if cds_seq: + prt_seq = SeqRecord(cds_seq.translate(), id=feature['name'], description='protein sequence') + out_seq_fh.write(prt_seq.format('fasta')) + else: + start = int(feature['exons'][0][0][0]) + end = int(feature['exons'][0][0][1] + 1) + cds_seq += rec.seq[start:end] + if feature['strand'] == '-': + cds_seq = cds_seq.reverse_complement() + cds_seq = find_max_orf(cds_seq) + if cds_seq: + genes_with_cds += 1 + prt_seq = SeqRecord(cds_seq.translate(), id=feature['name'], description='protein sequence') + out_seq_fh.write(prt_seq.format('fasta')) fasFH.close() out_seq_fh.close() @@ -126,6 +136,52 @@ def trsk_gene_len_dist(gtf_file, out_file="hist_cds_len.pdf"): df_cds_len_bin.plot(kind="bar") plt.savefig(out_file) +def find_max_orf(seq): + """ find optimal start and end for concatenated exons + @args seq: biopython alphabet object + + Returns: + False bool Longest ORF was less than 100 nt + Seq BioPython Seq Object Longest start to end substring in seq + """ + seq = str(seq) + start_arr = np.array([ m.start() for m in re.finditer('ATG', seq) ]) + taa_list = np.array([ m.start() for m in re.finditer('TAA', seq) ]) + tag_list = np.array([ m.start() for m in re.finditer('TAG', seq) ]) + tga_list = np.array([ m.start() for m in re.finditer('TGA', seq) ]) + end_arr = np.sort(np.hstack((taa_list, tag_list, tga_list))) + if len(start_arr) == 0 or len(end_arr) == 0: + return False + max_dist = 0 + max_start = -1 + max_end = -1 + start_arr_orf = [ int(x % 3) for x in start_arr ] + end_arr_orf = [ int(x % 3) for x in end_arr ] + for frame_ind in [0, 1, 2]: + start_mask = np.array([ val == frame_ind for val in start_arr_orf ]) + start_frame = start_arr[start_mask] + end_mask = np.array([ val == frame_ind for val in end_arr_orf ]) + end_frame = end_arr[end_mask] + if len(start_frame) != 0 and len(end_frame) != 0: + for start in np.sort(start_frame): + seen_end = False + for end in np.sort(end_frame): + if end < start or seen_end: + next + else: + seen_end = True + dist = end - start + if dist > max_end: + max_start = start + max_end = end + max_dist = dist + next + next + + if max_dist < 100: + return False + else: + return Seq(seq[int(max_start):int(max_end) + 3], SingleLetterAlphabet()) if __name__=="__main__": gname = "hs_chr22.gff" From cefbf8c40c967aa061fd00cfd6498c2f46539263 Mon Sep 17 00:00:00 2001 From: David Kuo Date: Mon, 14 Dec 2015 16:43:04 -0500 Subject: [PATCH 2/2] whitespace update --- proteome_analysis/translate_trsk_genes.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/proteome_analysis/translate_trsk_genes.py b/proteome_analysis/translate_trsk_genes.py index c791745..99b21a1 100644 --- a/proteome_analysis/translate_trsk_genes.py +++ b/proteome_analysis/translate_trsk_genes.py @@ -14,12 +14,13 @@ from Bio.Alphabet import SingleLetterAlphabet from gfftools import GFFParser, helper from signal_labels import generate_genome_seq_labels as seqlab -import pdb import os import re def translate_trsk_genes(gtf_file, fas_file, out_seq_fname, max_orf_opt = False): - """ translate the trsk genes to protein sequence @args gtf_file: genome annotation file @type gtf_file: str + """ translate the trsk genes to protein sequence + @args gtf_file: genome annotation file + @type gtf_file: str @args fas_file: genome sequence file @type fas_file: str @args out_seq_fname: output file in fasta format