じゆうけんきゅう : かいしこどんのまえにあるatgのかず

真核生物の翻訳について、教科書では「Met-tRNAが結合したリボソーム小サブユニットがmRNAの5'キャップから下流へスキャンし、最初のAUGで停止して翻訳を開始する」と説明される。しかし、 https://www.jstage.jst.go.jp/article/kagakutoseibutsu/54/3/54_191/_pdf によると、本来のコード領域よりも上流にAUGや終止コドンが存在する場合があるという。

そこで、NCBIの遺伝子データベースからヒトの第一染色体内の遺伝子の一部を自動で取得し、開始コドンの位置と開始コドンより前のAUGの数を計算するプログラムを作った。


import requests
import time
import lxml.etree
import itertools
import re

def search(db, term, retmax, retstart, timeout=60):
    """Run an NCBI E-utilities ESearch query and return the raw response body.

    Args:
        db: Entrez database name, sent as the ``db`` parameter.
        term: Query string, sent as the ``term`` parameter.
        retmax: Sent as ``retmax`` (size of the requested result window).
        retstart: Sent as ``retstart`` (offset of the requested result window).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The response body as ``bytes`` (not parsed).

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # Pause before every request; presumably to stay within NCBI's rate limits.
    time.sleep(1)
    response = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params={"db": db, "term": term, "retmax": retmax, "retstart": retstart},
        timeout=timeout,
    )
    return response.content

def fetch(db, key, timeout=60):
    """Download records by ID with NCBI E-utilities EFetch, as XML.

    Args:
        db: Entrez database name, sent as the ``db`` parameter.
        key: Record identifier(s), sent as the ``id`` parameter. Callers in
            this file pass several accessions joined with commas.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The response body as ``bytes`` (``retmode=xml`` is requested).

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # Pause before every request; presumably to stay within NCBI's rate limits.
    time.sleep(1)
    response = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        params={"db": db, "id": key, "retmode": "xml"},
        timeout=timeout,
    )
    return response.content

def search_and_fetch(db, term, retmax, retstart, timeout=60):
    """Search a database and download the matching records in one go.

    The search is run with ``usehistory=y`` so that the result set is stored
    on the server; the ``WebEnv``/``QueryKey`` pair from the search response
    is then handed to EFetch instead of an explicit ID list.

    Args:
        db: Entrez database name used for both requests.
        term: Query string for the search.
        retmax: Sent as ``retmax`` to both requests.
        retstart: Sent as ``retstart`` to both requests.
        timeout: Seconds to wait for each request before giving up.

    Returns:
        The EFetch response body as ``bytes`` (``retmode=xml`` is requested).

    Raises:
        ValueError: If the search response carries no ``WebEnv`` or
            ``QueryKey`` element (for example an error document).
        requests.exceptions.Timeout: If a request does not answer in time.
    """
    # Pause before hitting the server; presumably to respect NCBI rate limits.
    time.sleep(1)
    search_data = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params={
            "db": db,
            "term": term,
            "retmax": retmax,
            "retstart": retstart,
            "usehistory": "y",
        },
        timeout=timeout,
    ).content
    tree = lxml.etree.XML(search_data)

    webenv_elem = tree.find("WebEnv")
    query_key_elem = tree.find("QueryKey")
    if webenv_elem is None or query_key_elem is None:
        # Fail with the server's answer in hand rather than an opaque
        # "'NoneType' object has no attribute 'text'".
        raise ValueError(
            "ESearch response has no WebEnv/QueryKey: {!r}".format(search_data[:200])
        )

    fetch_data = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        params={
            "db": db,
            "retmax": retmax,
            "retstart": retstart,
            "query_key": query_key_elem.text,
            "WebEnv": webenv_elem.text,
            "retmode": "xml",
        },
        timeout=timeout,
    ).content
    return fetch_data

def get_all_transcription_variants(entrezgene):
    """Collect transcript accessions and their peptide accessions for one gene.

    Walks ``Entrezgene_locus/Gene-commentary/Gene-commentary_products/*`` to
    find the transcript entries, and each transcript's own
    ``Gene-commentary_products/*`` to find its peptides.

    Args:
        entrezgene: An ``Entrezgene`` XML element (anything offering the
            ElementTree ``find``/``findall`` API).

    Returns:
        A tuple ``(mrna_list, protein_list_list)``. ``mrna_list`` holds the
        transcript accessions; ``protein_list_list[i]`` is the list of
        peptide accessions belonging to ``mrna_list[i]``. Both are empty when
        the element has no ``Entrezgene_locus``. Products without a
        ``Gene-commentary_accession`` are skipped.
    """
    mrna_list = []
    protein_list_list = []

    locus = entrezgene.find("Entrezgene_locus")
    if locus is None:
        # Nothing to walk; report "no transcripts" instead of crashing.
        return mrna_list, protein_list_list

    for mrna in locus.findall("Gene-commentary/Gene-commentary_products/*"):
        mrna_accession = mrna.find("Gene-commentary_accession")
        if mrna_accession is None:
            # Skip the whole product so both result lists stay aligned.
            continue

        protein_list = []
        for peptide in mrna.findall("Gene-commentary_products/*"):
            accession = peptide.find("Gene-commentary_accession")
            # Compare with "is None": element truthiness depends on children.
            if accession is None:
                continue
            protein_list.append(accession.text)

        mrna_list.append(mrna_accession.text)
        protein_list_list.append(protein_list)

    return mrna_list, protein_list_list

def search_gene_ids(query, num=100, start=0):
    """Search the ``gene`` database and return the matching IDs.

    Args:
        query: Query string passed to the search.
        num: Size of the requested result window (``retmax``).
        start: Offset of the requested result window (``retstart``).

    Returns:
        A list with the text of every ``Id`` element found under ``IdList``
        in the search response.
    """
    index_data = search("gene", query, num, start)

    # The IDs live in the parsed document; the raw response bytes have no
    # findall(), so the lookup must go through the tree.
    tree = lxml.etree.XML(index_data)

    return [id_elem.text for id_elem in tree.findall("IdList/Id")]

def get_all_gene_data(term, retmax, retstart):
    """Fetch transcripts and their translations for the genes matching a query.

    Searches the ``gene`` database, gathers the transcript accessions of
    every returned ``Entrezgene`` record, then downloads those transcripts
    from ``nuccore`` and extracts sequence and annotated translations.

    Args:
        term: Query string for the gene search.
        retmax: Size of the requested gene result window.
        retstart: Offset of the requested gene result window.

    Returns:
        A list with one entry per downloaded ``GBSeq`` record, each of the
        form ``[mrna_id, sequence, peptide_list]`` where ``peptide_list`` is
        a list of ``[protein_id, translation]`` pairs (``protein_id`` is
        ``None`` when the feature carries no such qualifier). Returns an
        empty list when the genes yield no transcript accession.
    """
    result = search_and_fetch("gene", term, retmax, retstart)

    # The same transcript can be listed by more than one gene record;
    # requesting it once keeps duplicate rows out of the result.
    accessions = []
    seen = set()
    for gene in lxml.etree.XML(result).findall("Entrezgene"):
        for accession in get_all_transcription_variants(gene)[0]:
            if accession and accession not in seen:
                seen.add(accession)
                accessions.append(accession)

    if not accessions:
        # Avoid sending a fetch request with an empty id list.
        return []

    nuccore_data = fetch("nuccore", ",".join(accessions))
    tree = lxml.etree.XML(nuccore_data)

    mrna_list = []
    for mrna in tree.findall('GBSeq'):
        mrna_id = mrna.find("GBSeq_locus").text
        transcription = mrna.find("GBSeq_sequence").text
        peptide_list = []
        # Every "translation" qualifier marks a feature with a protein product.
        for label in mrna.xpath('GBSeq_feature-table/GBFeature/GBFeature_quals/GBQualifier/GBQualifier_name[text()="translation"]'):
            # The protein_id qualifier sits next to it in the same feature.
            id_labels = label.xpath('../../GBQualifier/GBQualifier_name[text()="protein_id"]')
            if id_labels:
                protein_id = id_labels[0].find("../GBQualifier_value").text
            else:
                protein_id = None
            translation = label.find("../GBQualifier_value").text
            peptide_list.append([protein_id, translation])
        mrna_list.append([mrna_id, transcription, peptide_list])
    return mrna_list

# Standard genetic code: lowercase DNA codon -> one-letter amino acid
# ("*" marks a stop codon). Codons are enumerated with the bases in the
# order t, c, a, g, the third position varying fastest.
codon_table = {
    "".join(triplet): amino_acid
    for triplet, amino_acid in zip(
        itertools.product("tcag", repeat=3),
        'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
    )
}

# Reverse lookup: amino acid -> list of codons encoding it, in table order.
inv_codon_table = {}
for codon, amino_acid in codon_table.items():
    inv_codon_table.setdefault(amino_acid, []).append(codon)

def reverse_translate_regex(peptide):
    """Build a regex matching every nucleotide sequence that encodes *peptide*.

    Each residue becomes one parenthesised group listing all of its codons
    from ``inv_codon_table`` separated by ``|``; the groups are concatenated
    in residue order. A residue missing from the table raises ``KeyError``.
    """
    groups = []
    for residue in peptide:
        alternatives = "|".join(inv_codon_table[residue])
        groups.append("(" + alternatives + ")")
    return "".join(groups)

def get_initiation_pos(rna, peptide):
    """Return the 1-based position in *rna* where the coding of *peptide* starts.

    The peptide is reverse-translated into a regex and the first match in
    *rna* is used. When nothing matches, ``re.search`` returns ``None`` and
    the attribute access below raises ``AttributeError``.
    """
    pattern = reverse_translate_regex(peptide)
    match = re.search(pattern, rna)
    # Match offsets are 0-based; report a 1-based position.
    return match.start() + 1

# Download a window of 10 gene records, starting at offset 200, from the
# genes matching the NC_000001.11 nucleotide accession query.
genedata = get_all_gene_data("NC_000001.11[Nucleotide Accession]", 10, 200)

# One row per (transcript, peptide):
#   [mRNA id, peptide id, 1-based start position, number of upstream "atg"]
# The last two entries are None when the start could not be located.
positions = []

for rna_id, rna_sequence, peptides in genedata:
    for peptide_id, peptide_sequence in peptides:
        try:
            pos = get_initiation_pos(rna_sequence, peptide_sequence)
        except Exception:
            # Best effort: a peptide that cannot be located (no match in the
            # transcript, residue missing from the codon table, missing
            # sequence, ...) is recorded as None. Unlike a bare "except:",
            # this lets Ctrl-C and SystemExit stop the script.
            pos = None

        if pos is None:
            num_of_atgs_before_init = None
        else:
            # pos is 1-based, so everything before index pos - 1 is upstream.
            num_of_atgs_before_init = rna_sequence[:pos - 1].count("atg")

        positions.append([rna_id, peptide_id, pos, num_of_atgs_before_init])

print(positions)

出力(pprintで整形) : 各行は [mRNA ID, ペプチド ID, 開始コドンの位置(1始まり), 開始コドンより前のAUG(配列上はatg)の個数]

[['NM_001146068', 'NP_001139540.1', 237, 3],
 ['NM_001748', 'NP_001739.3', 103, 0],
 ['XM_017002189', 'XP_016857678.1', 237, 2],
 ['NM_001290403', 'NP_001277332.1', 327, 3],
 ['NM_003189', 'NP_003180.1', 445, 3],
 ['XM_017002193', 'XP_016857682.1', 271, 3],
 ['XM_017002191', 'XP_016857680.1', 58, 1],
 ['XM_017002190', 'XP_016857679.1', 933, 8],
 ['NM_001290405', 'NP_001277334.1', 200, 2],
 ['NM_001290404', 'NP_001277333.1', 332, 3],
 ['NM_001287347', 'NP_001274276.1', 217, 1],
 ['XM_017002188', 'XP_016857677.1', 434, 4],
 ['XM_017002187', 'XP_016857676.1', 1242, 11],
 ['XM_005271160', 'XP_005271217.1', 1827, 13],
 ['NM_001290406', 'NP_001277335.1', 91, 2],
 ['XM_017002192', 'XP_016857681.1', 421, 4],
 ['XM_017002027', 'XP_016857516.1', 543, 8],
 ['NM_000329', 'NP_000320.1', 50, 0],
 ['NM_000329', 'NP_000320.1', 50, 0],
 ['NM_004905', 'NP_004896.1', 64, 0],
 ['NM_203342', 'NP_976217.1', 835, 5],
 ['NM_001166005', 'NP_001159477.1', 128, 0],
 ['NM_203343', 'NP_976218.1', 119, 0],
 ['NM_004437', 'NP_004428.1', 809, 4],
 ['NM_001166007', 'NP_001159479.1', 729, 3],
 ['XM_011540964', 'XP_011539266.1', 726, 3],
 ['XM_017000595', 'XP_016856084.1', 726, 3],
 ['XM_017000594', 'XP_016856083.1', 726, 3],
 ['XM_017000596', 'XP_016856085.1', 726, 3],
 ['XM_017000593', 'XP_016856082.1', 726, 3],
 ['XM_017000597', 'XP_016856086.1', 726, 3],
 ['XM_024453880', 'XP_024309648.1', 726, 3],
 ['XM_017000599', 'XP_016856088.1', 726, 3],
 ['XM_017000598', 'XP_016856087.1', 726, 3],
 ['XM_017000600', 'XP_016856089.1', 726, 3],
 ['XM_017000603', 'XP_016856092.1', 726, 3],
 ['XM_017000602', 'XP_016856091.1', 726, 3],
 ['XM_017000604', 'XP_016856093.1', 726, 3],
 ['XM_017000584', 'XP_016856073.1', 495, 7],
 ['XM_017000585', 'XP_016856074.1', 492, 7],
 ['XM_011540956', 'XP_011539258.1', 203, 0],
 ['XM_011540958', 'XP_011539260.1', 203, 0],
 ['XM_005245760', 'XP_005245817.1', 203, 0],
 ['XM_006710434', 'XP_006710497.1', 203, 0],
 ['XM_006710439', 'XP_006710502.1', 203, 0],
 ['XM_005245753', 'XP_005245810.1', 203, 0],
 ['XM_005245761', 'XP_005245818.1', 203, 0],
 ['XM_005245768', 'XP_005245825.1', 203, 0],
 ['XM_011540959', 'XP_011539261.1', 203, 0],
 ['XM_005245763', 'XP_005245820.1', 203, 0],
 ['XM_011540961', 'XP_011539263.1', 203, 0],
 ['XM_017000586', 'XP_016856075.1', 203, 0],
 ['XM_005245757', 'XP_005245814.1', 203, 0],
 ['XM_005245765', 'XP_005245822.1', 203, 0],
 ['XM_011540960', 'XP_011539262.1', 203, 0],
 ['XM_005245764', 'XP_005245821.1', 203, 0],
 ['XM_011540962', 'XP_011539264.1', 203, 0],
 ['XM_005245769', 'XP_005245826.1', 203, 0],
 ['XM_005245774', 'XP_005245831.1', 203, 0],
 ['XM_011540957', 'XP_011539259.1', 203, 0],
 ['XM_017000583', 'XP_016856072.1', 203, 0],
 ['XM_005245770', 'XP_005245827.1', 203, 0],
 ['XM_017000581', 'XP_016856070.1', 203, 0],
 ['XM_017000589', 'XP_016856078.1', 203, 0],
 ['XM_017000582', 'XP_016856071.1', 203, 0],
 ['XM_017000590', 'XP_016856079.1', 203, 0],
 ['XM_005245772', 'XP_005245829.1', 203, 0],
 ['XM_011540963', 'XP_011539265.1', 203, 0],
 ['XM_011540965', 'XP_011539267.1', 203, 0],
 ['XM_005245773', 'XP_005245830.1', 203, 0],
 ['XM_017000591', 'XP_016856080.1', 203, 0],
 ['XM_017000587', 'XP_016856076.1', 203, 0],
 ['XM_017000588', 'XP_016856077.1', 203, 0],
 ['XM_017000592', 'XP_016856081.1', 203, 0],
 ['NM_001166006', 'NP_001159478.1', 201, 0],
 ['NM_203342', 'NP_976217.1', 835, 5],
 ['NM_006610', 'NP_006601.2', 33, 0],
 ['XM_017000097', 'XP_016855586.1', 33, 0],
 ['NM_139208', 'NP_631947.1', 33, 0],
 ['NM_006610', 'NP_006601.2', 33, 0],
 ['NM_139208', 'NP_631947.1', 33, 0],
 ['NM_004759', 'NP_004750.1', 326, 0],
 ['NM_032960', 'NP_116584.2', 326, 0],
 ['XM_005273353', 'XP_005273410.1', 319, 0],
 ['XM_017002810', 'XP_016858299.1', 156, 0],
 ['NM_032960', 'NP_116584.2', 326, 0],
 ['NM_001127651', 'NP_001121123.1', 106, 0],
 ['XM_011509580', 'XP_011507882.1', 114, 0],
 ['NM_001190789', 'NP_001177718.1', 276, 3],
 ['NM_001190794', 'NP_001177723.1', 276, 3],
 ['NM_000433', 'NP_000424.2', 276, 3],
 ['XM_011509581', 'XP_011507883.1', 101, 2],
 ['XM_005245207', 'XP_005245264.1', 205, 1],
 ['NM_000433', 'NP_000424.2', 276, 3],
 ['NM_001514', 'NP_001505.1', 69, 0],
 ['XM_011541299', 'XP_011539601.1', 415, 6]]

結果としては、かなり多くの場合で開始コドンよりまえにAUGがあることがわかった。