# -*- coding: utf-8 -*-
#
# seqann Sequence Annotation.
# Copyright (c) 2017 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
#
# This library is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation; either version 3 of the License, or (at
# your option) any later version.
#
# This library is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
# License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this library; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
#
# > http://www.fsf.org/licensing/licenses/lgpl.html
# > http://www.opensource.org/licenses/lgpl-license.php
#
from Bio import SeqIO
from Bio import SearchIO
from Bio.Blast.Applications import NcbiblastnCommandline
from seqann.util import cleanup
from seqann.util import randomid
from seqann.models.blast import Blast
from seqann.models.reference_data import ReferenceData
import logging
import re
# TODO: Move to util.py
has_hla = lambda x: True if re.search("HLA", x) else False
[docs]def blastn(sequences, locus, nseqs, kir=False,
verbose=False, refdata=None, evalue=10):
"""
Gets the a list of alleles that are the most similar to the input sequence
:param sequences: The input sequence record.
:type sequences: SeqRecord
:param locus: The gene locus associated with the sequence.
:type locus: ``str``
:param nseqs: The incomplete annotation from a previous iteration.
:type nseqs: ``int``
:param evalue: The evalue to use (default = 10)
:type evalue: ``int``
:param kir: Run with KIR or not
:type kir: ``bool``
:param verbose: Run in versboe
:type verbose: ``bool``
:param refdata: An object with reference data
:type refdata: :ref:`ref`
:rtype: :ref:`bl`
Example usage:
>>> from Bio.Seq import Seq
>>> from seqann.blast_cmd import blastn
>>> sequence = Seq('AGAGACTCTCCCGAGGATTTCGTGTACCAGTTTAAGGCCATGTGCTACTTCACC')
>>> blast = blastn(sequence, locus, nseqs)
"""
logger = logging.getLogger("Logger." + __name__)
if not refdata:
refdata = ReferenceData()
file_id = str(randomid())
input_fasta = file_id + ".fasta"
output_xml = file_id + ".xml"
SeqIO.write(sequences, input_fasta, "fasta")
blastn_cline = NcbiblastnCommandline(query=input_fasta,
db=refdata.blastdb,
evalue=evalue,
outfmt=5,
reward=1,
penalty=-3,
gapopen=5,
gapextend=2,
dust='yes',
out=output_xml)
stdout, stderr = blastn_cline()
loc = locus
if not kir:
loc = locus.split("-")[1]
blast_qresult = SearchIO.read(output_xml, 'blast-xml')
# Delete files
cleanup(file_id)
# TODO: Use logging
if len(blast_qresult.hits) == 0:
if verbose:
logger.error("Failed blast! No hits!")
logger.error(stderr)
return Blast(failed=True)
alleles = []
full_sequences = []
load_blast = 70 if nseqs < 70 else nseqs
l = len(blast_qresult.hits) if load_blast > len(blast_qresult.hits) else load_blast
# TODO: update all blast files to have HLA-
if locus in refdata.hla_loci and not kir:
alleles = [blast_qresult[i].id.split("_")[0] for i in range(0, l)
if blast_qresult[i].id.split("*")[0] == locus or "HLA-" + blast_qresult[i].id.split("*")[0] == locus]
alleles = ["HLA-" + a if not has_hla(a) else a for a in alleles]
if kir:
alleles = [blast_qresult[i].id.split("_")[0] for i in range(0, l)
if blast_qresult[i].id.split("*")[0] == locus]
if verbose:
logger.info("Blast alleles: " + ",".join(alleles))
# TODO: sort alleles by number of features they contain and evalue
# Use biosql db if provided
# otherwise use IMGT dat file
final_seqs = []
rmax = refdata.structure_max[locus]
if refdata.server_avail:
db = refdata.server[refdata.dbversion + "_" + loc]
full_sequences = []
for n in alleles:
if n in refdata.hla_names:
try:
seq = db.lookup(name=n)
full_sequences.append(seq)
except:
logger.error("Allele doesnt exist in IMGT BioSQL DB!! "
+ n)
else:
if verbose:
logger.info("Getting sequences from HLA.dat file")
full_sequences = [refdata.hlaref[a] for a in alleles
if a in refdata.hlaref]
for s in full_sequences:
s.name = s.description.split(",")[0]
i = 1
last_seq = []
max_f = 0
added_max = False
full_feats = False
for s in full_sequences:
fs = len([f.type for f in s.features
if not f.type in ['source', 'CDS']])
if i <= nseqs:
final_seqs.append(s)
max_f = fs if fs > max_f else max_f
if i <= nseqs and max_f < rmax:
full_feats = True
if(i >= nseqs and fs == max_f and not added_max):
if len(last_seq) >= 10:
last_seq.insert(3, s)
else:
last_seq.append(s)
added_max = True
if(fs > max_f and len(last_seq) < 10
and i >= nseqs and len(last_seq) < 10):
last_seq.append(s)
i += 1
if full_feats:
for s in last_seq:
final_seqs.append(s)
# Build Blast object
blast_o = Blast(match_seqs=final_seqs, alleles=alleles)
return blast_o
[docs]def get_locus(sequences, kir=False, verbose=False, refdata=None, evalue=10):
"""
Gets the locus of the sequence by running blastn
:param sequences: sequenences to blast
:param kir: bool whether the sequences are KIR or not
:rtype: ``str``
Example usage:
>>> from Bio.Seq import Seq
>>> from seqann.blast_cmd import get_locus
>>> sequence = Seq('AGAGACTCTCCCGAGGATTTCGTGTACCAGTTTAAGGCCATGTGCTACTTCACC')
>>> locus = get_locus(sequence)
"""
if not refdata:
refdata = ReferenceData()
file_id = str(randomid())
input_fasta = file_id + ".fasta"
output_xml = file_id + ".xml"
SeqIO.write(sequences, input_fasta, "fasta")
blastn_cline = NcbiblastnCommandline(query=input_fasta,
db=refdata.blastdb,
evalue=evalue,
outfmt=5,
reward=1,
penalty=-3,
gapopen=5,
gapextend=2,
dust='yes',
out=output_xml)
stdout, stderr = blastn_cline()
blast_qresult = SearchIO.read(output_xml, 'blast-xml')
# Delete files
cleanup(file_id)
if len(blast_qresult.hits) == 0:
return ''
loci = []
for i in range(0, 3):
if kir:
loci.append(blast_qresult[i].id.split("*")[0])
else:
loci.append(blast_qresult[i].id.split("*")[0])
locus = set(loci)
if len(locus) == 1:
if has_hla(loci[0]) or kir:
return loci[0]
else:
return "HLA-" + loci[0]
else:
return ''