jackhmmer
jackhmmer is a tool in the HMMER software suite for highly sensitive protein sequence searching. HMMER is a suite of tools for analyzing protein sequences; it uses profile hidden Markov models (profile HMMs) to capture the multiple sequence alignment information of a protein family.
The main features and uses of the jackhmmer tool are:
- Iterative search: jackhmmer searches iteratively, updating its profile model after each round so that sensitivity increases step by step. This helps uncover remote, weakly similar sequence relationships (see the minimal sketch after this list).
- Profile search: jackhmmer represents a protein family with a profile HMM (profile hidden Markov model). Such a model captures not only the distribution of amino acids at each alignment position but also position-specific insertion and deletion probabilities.
- Database search: jackhmmer can search large protein databases (such as UniProt) for similar sequences. Given a query sequence, it compares the query against every sequence in the database and reports the significant matches.
- Sensitivity and specificity: jackhmmer is designed to stay highly sensitive while keeping the number of false-positive hits low, i.e. to preserve specificity.
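At the command line, jackhmmer takes a query FASTA file and a target FASTA database. As a minimal sketch of a direct invocation from Python (assuming jackhmmer is on PATH; query.fasta and targets.fasta are hypothetical placeholder files):

import subprocess

# -N: maximum number of search iterations; --incE / -E: inclusion and
# reporting E-value thresholds; -A: save the final MSA in Stockholm format;
# --noali: omit per-hit alignments from the human-readable output.
cmd = ["jackhmmer", "-N", "3", "--incE", "1e-4", "-E", "1e-4",
       "-A", "msa.sto", "--noali", "query.fasta", "targets.fasta"]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
    raise RuntimeError(f"jackhmmer failed:\n{result.stderr}")

The Python wrapper below drives the same binary through subprocess and adds chunked, multithreaded database searching: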
from concurrent import futures
import contextlib
import os
import shutil
import subprocess
import tempfile
import time
from typing import Any, Callable, Mapping, Optional, Sequence, Set

from absl import logging


@contextlib.contextmanager
def timing(msg: str):
  logging.info('Started %s', msg)
  tic = time.time()
  yield
  toc = time.time()
  logging.info('Finished %s in %.3f seconds', msg, toc - tic)

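# A quick usage sketch of the timing helper above (the message string and
# the timed call are hypothetical):
#
#   with timing('MSA search'):
#       run_search()
#
# absl then logs 'Started MSA search' and 'Finished MSA search in N.NNN
# seconds' around the body of the with-block.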

@contextlib.contextmanager
def tmpdir_manager(base_dir: Optional[str] = None):
  """Context manager that deletes a temporary directory on exit."""
  tmpdir = tempfile.mkdtemp(dir=base_dir)
  try:
    yield tmpdir
  finally:
    shutil.rmtree(tmpdir, ignore_errors=True)


def _keep_line(line: str, seqnames: Set[str]) -> bool:
  """Function to decide which lines to keep."""
  if not line.strip():
    return True
  if line.strip() == '//':  # End tag
    return True
  if line.startswith('# STOCKHOLM'):  # Start tag
    return True
  if line.startswith('#=GC RF'):  # Reference Annotation Line
    return True
  if line[:4] == '#=GS':  # Description lines - keep if sequence in list.
    _, seqname, _ = line.split(maxsplit=2)
    return seqname in seqnames
  elif line.startswith('#'):  # Other markup - filter out
    return False
  else:  # Alignment data - keep if sequence in list.
    seqname = line.partition(' ')[0]
    return seqname in seqnames


def truncate_stockholm_msa(stockholm_msa_path: str, max_sequences: int) -> str:
  """Reads + truncates a Stockholm file while preventing excessive RAM usage."""
  seqnames = set()
  filtered_lines = []
  with open(stockholm_msa_path) as f:
    for line in f:
      if line.strip() and not line.startswith(('#', '//')):
        # Ignore blank lines, markup and end symbols - remainder are alignment
        # sequence parts.
        seqname = line.partition(' ')[0]
        seqnames.add(seqname)
        if len(seqnames) >= max_sequences:
          break
    f.seek(0)
    for line in f:
      if _keep_line(line, seqnames):
        filtered_lines.append(line)
  return ''.join(filtered_lines)


class Jackhmmer:
  """Python wrapper of the Jackhmmer binary."""

  def __init__(self,
               *,
               binary_path: str,
               database_path: str,
               n_cpu: int = 8,
               n_iter: int = 1,
               e_value: float = 0.0001,
               z_value: Optional[int] = None,
               get_tblout: bool = False,
               filter_f1: float = 0.0005,
               filter_f2: float = 0.00005,
               filter_f3: float = 0.0000005,
               incdom_e: Optional[float] = None,
               dom_e: Optional[float] = None,
               num_streamed_chunks: int = 2,
               streaming_callback: Optional[Callable[[int], None]] = None):
    """Initializes the Python Jackhmmer wrapper.

    Args:
      binary_path: The path to the jackhmmer executable.
      database_path: The path to the jackhmmer database (FASTA format).
      n_cpu: The number of CPUs to give Jackhmmer.
      n_iter: The number of Jackhmmer iterations.
      e_value: The E-value, see Jackhmmer docs for more details.
      z_value: The Z-value, see Jackhmmer docs for more details.
      get_tblout: Whether to save tblout string.
      filter_f1: MSV and biased composition pre-filter, set to >1.0 to turn off.
      filter_f2: Viterbi pre-filter, set to >1.0 to turn off.
      filter_f3: Forward pre-filter, set to >1.0 to turn off.
      incdom_e: Domain e-value criteria for inclusion of domains in MSA/next
        round.
      dom_e: Domain e-value criteria for inclusion in tblout.
      num_streamed_chunks: Number of database chunks to stream over.
      streaming_callback: Callback function run after each chunk iteration with
        the iteration number as argument.
    """
    self.binary_path = binary_path
    self.database_path = database_path
    self.num_streamed_chunks = num_streamed_chunks

    if not os.path.exists(self.database_path) and num_streamed_chunks is None:
      logging.error('Could not find Jackhmmer database %s', database_path)
      raise ValueError(f'Could not find Jackhmmer database {database_path}')

    self.n_cpu = n_cpu
    self.n_iter = n_iter
    self.e_value = e_value
    self.z_value = z_value
    self.filter_f1 = filter_f1
    self.filter_f2 = filter_f2
    self.filter_f3 = filter_f3
    self.incdom_e = incdom_e
    self.dom_e = dom_e
    self.get_tblout = get_tblout
    self.streaming_callback = streaming_callback

  def _query_chunk(self,
                   input_fasta_path: str,
                   database_path: str,
                   max_sequences: Optional[int] = None) -> Mapping[str, Any]:
    """Queries the database chunk using Jackhmmer."""
_query_chunk")with tmpdir_manager() as query_tmp_dir:sto_path = os.path.join(query_tmp_dir, 'output.sto')# The F1/F2/F3 are the expected proportion to pass each of the filtering# stages (which get progressively more expensive), reducing these# speeds up the pipeline at the expensive of sensitivity. They are# currently set very low to make querying Mgnify run in a reasonable# amount of time.cmd_flags = [# Don't pollute stdout with Jackhmmer output.'-o', '/dev/null','-A', sto_path,'--noali','--F1', str(self.filter_f1),'--F2', str(self.filter_f2),'--F3', str(self.filter_f3),'--incE', str(self.e_value),# Report only sequences with E-values <= x in per-sequence output.'-E', str(self.e_value),'--cpu', str(self.n_cpu),'-N', str(self.n_iter)]if self.get_tblout:tblout_path = os.path.join(query_tmp_dir, 'tblout.txt')cmd_flags.extend(['--tblout', tblout_path])if self.z_value:cmd_flags.extend(['-Z', str(self.z_value)])if self.dom_e is not None:cmd_flags.extend(['--domE', str(self.dom_e)])if self.incdom_e is not None:cmd_flags.extend(['--incdomE', str(self.incdom_e)])#print("input_fasta_path:",input_fasta_path)cmd = [self.binary_path] + cmd_flags + [input_fasta_path, database_path]#print('cmd:',cmd)logging.info('Launching subprocess "%s"', ' '.join(cmd))process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)with timing(f'Jackhmmer ({os.path.basename(database_path)}) query'):_, stderr = process.communicate()retcode = process.wait()if retcode:raise RuntimeError('Jackhmmer failed\nstderr:\n%s\n' % stderr.decode('utf-8'))# Get e-values treamed_chunks or each target nametbl = ''if self.get_tblout:with open(tblout_path) as f:tbl = f.read()if max_sequences is None:with open(sto_path) as f:sto = f.read()else:sto = truncate_stockholm_msa(sto_path, max_sequences)raw_output = dict(sto=sto,tbl=tbl,stderr=stderr,n_iter=self.n_iter,e_value=self.e_value)return raw_outputdef query(self,input_fasta_path: str,max_sequences: Optional[int] = None) -> Sequence[Mapping[str, Any]]:"""Queries the database using Jackhmmer."""return self.query_multiple([input_fasta_path], max_sequences)[0]def query_multiple(self,input_fasta_paths: Sequence[str],max_sequences: Optional[int] = None,) -> Sequence[Sequence[Mapping[str, Any]]]:"""Queries the database for multiple queries using Jackhmmer."""#print("in function: query_multiple")## 1. 只有一個數據庫文件的代碼,不需要多線程 if self.num_streamed_chunks is None:single_chunk_results = []for input_fasta_path in input_fasta_paths:single_chunk_results.append([self._query_chunk(input_fasta_path, self.database_path, max_sequences)])return single_chunk_results#db_basename = os.path.basename(self.database_path)## 2. 
    # 2. A large database can be split into several chunk files and searched
    # in parallel. The lambda builds the path of the i-th database chunk.
    db_chunk = lambda db_idx: f'{self.database_path}.{db_idx}'

    # Run self._query_chunk in multiple threads.
    with futures.ThreadPoolExecutor(max_workers=3) as executor:
      # chunked_outputs holds one list per query; each entry in it is a dict
      # with the search results for one database chunk.
      chunked_outputs = [[] for _ in range(len(input_fasta_paths))]
      for fasta_index, input_fasta_path in enumerate(input_fasta_paths):
        # For each FASTA query, run self._query_chunk on every chunk.
        m_futures = {
            executor.submit(self._query_chunk, input_fasta_path,
                            db_chunk(i), max_sequences): i
            for i in range(1, self.num_streamed_chunks + 1)}
        # Wait for the threads to finish and collect their results.
        for future in futures.as_completed(m_futures):
          data = future.result()
          chunked_outputs[fasta_index].append(data)
    return chunked_outputs


if __name__ == "__main__":
  jackhmmer_binary_path = "/home/zheng/anaconda3/envs/deep_learning/bin/jackhmmer"
  database_path = "/home/zheng/test/test_database/globins45.fa"
  input_fasta_paths = ["/home/zheng/test/HBB_HUMAN.fa",
                       "/home/zheng/test/HBB_LARRI.fa"]

  print("Creating the jackhmmer_runner object:")
  # With num_streamed_chunks=1 the wrapper searches a single chunk file named
  # '<database_path>.1'; pass num_streamed_chunks=None to search the unsplit
  # FASTA file directly.
  jackhmmer_runner = Jackhmmer(binary_path=jackhmmer_binary_path,
                               database_path=database_path,
                               num_streamed_chunks=1)
  print("Searching for homologous sequences:")
  # Search homologs of a single query sequence.
  chunked_outputs = jackhmmer_runner.query(input_fasta_paths[1], 3)
  # Search homologs of multiple query sequences.
  # chunked_outputs = jackhmmer_runner.query_multiple(input_fasta_paths, 3)
  print("chunked_outputs:", chunked_outputs)
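Each result dict returned above carries the raw Stockholm alignment in its 'sto' field. As a rough post-processing sketch (the parse_stockholm helper is an illustration assumed here, not part of the wrapper), the aligned rows can be collected per sequence name:

def parse_stockholm(sto):
    """Collects the aligned fragments of each sequence in a Stockholm MSA."""
    sequences = {}
    for line in sto.splitlines():
        # Skip blank lines, '#' markup and the '//' end-of-record tag.
        if not line.strip() or line.startswith(('#', '//')):
            continue
        name, _, aligned = line.partition(' ')
        # A sequence may be split across several blocks; concatenate them.
        sequences[name] = sequences.get(name, '') + aligned.strip()
    return sequences

# chunked_outputs from Jackhmmer.query is a list with one dict per chunk.
msa = parse_stockholm(chunked_outputs[0]['sto'])
print(len(msa), "sequences in the MSA")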