蛋白質序列FeatureDict轉化為TensorDict

主要轉化語句為 tensor_dict = {k: tf.constant(v) for k, v in np_example.items() if k in features_metadata}。另外增加了特征名稱的選擇、不同特征維度、特征數的判斷等。

from typing import Dict, Tuple, Sequence, Union, Mapping, Optional
#import tensorflow.compat.v1 as tf
import tensorflow as tf
import numpy as np
import pickle# Type aliases.
FeaturesMetadata = Dict[str, Tuple[tf.dtypes.DType, Sequence[Union[str, int]]]]
TensorDict = Dict[str, tf.Tensor]

# Symbolic placeholders used inside FEATURES shapes. They are replaced by
# concrete sizes in protein_features_shape().
NUM_RES = 'num residues placeholder'
NUM_TEMPLATES = 'num templates placeholder'
NUM_SEQ = "length msa placeholder"

# Size of the atom dimension used by the all-atom features in FEATURES.
atom_type_num = 37

# Maps each feature name to (dtype, shape); shape entries are either ints or
# one of the placeholder strings above.
FEATURES = {
    #### Static features of a protein sequence ####
    "aatype": (tf.float32, [NUM_RES, 21]),
    "between_segment_residues": (tf.int64, [NUM_RES, 1]),
    "deletion_matrix": (tf.float32, [NUM_SEQ, NUM_RES, 1]),
    "domain_name": (tf.string, [1]),
    "msa": (tf.int64, [NUM_SEQ, NUM_RES, 1]),
    "num_alignments": (tf.int64, [NUM_RES, 1]),
    "residue_index": (tf.int64, [NUM_RES, 1]),
    "seq_length": (tf.int64, [NUM_RES, 1]),
    "sequence": (tf.string, [1]),
    "all_atom_positions": (tf.float32, [NUM_RES, atom_type_num, 3]),
    "all_atom_mask": (tf.int64, [NUM_RES, atom_type_num]),
    "resolution": (tf.float32, [1]),
    "template_domain_names": (tf.string, [NUM_TEMPLATES]),
    "template_sum_probs": (tf.float32, [NUM_TEMPLATES, 1]),
    "template_aatype": (tf.float32, [NUM_TEMPLATES, NUM_RES, 22]),
    "template_all_atom_positions": (
        tf.float32, [NUM_TEMPLATES, NUM_RES, atom_type_num, 3]),
    "template_all_atom_masks": (
        tf.float32, [NUM_TEMPLATES, NUM_RES, atom_type_num, 1]),
}


def _make_features_metadata(feature_names: Sequence[str]) -> FeaturesMetadata:
    """Builds a feature-name -> (dtype, shape) mapping from a list of names.

    Args:
        feature_names: Names of the requested features; each must be a key of
            FEATURES.

    Returns:
        The FEATURES entries for the requested names plus the always-required
        "aatype", "sequence" and "seq_length".

    Raises:
        KeyError: If a requested name is not present in FEATURES.
    """
    # Make sure these features are always read.
    required_features = ["aatype", "sequence", "seq_length"]
    feature_names = list(set(feature_names) | set(required_features))
    features_metadata = {name: FEATURES[name] for name in feature_names}
    return features_metadata


def np_to_tensor_dict(
        np_example: Mapping[str, np.ndarray],
        features: Sequence[str],
) -> TensorDict:
    """Creates a dict of tensors from a dict of NumPy arrays.

    Args:
        np_example: A dict of NumPy feature arrays.
        features: Feature names to be returned.

    Returns:
        A dictionary mapping feature names to tensors. Only entries of
        np_example whose name is in the selected metadata are kept; every
        kept tensor is reshaped by parse_reshape_logic().
    """
    features_metadata = _make_features_metadata(features)
    print(f"features_metadata:{features_metadata}")
    tensor_dict = {
        k: tf.constant(v)
        for k, v in np_example.items()
        if k in features_metadata
    }

    # Ensures shapes are as expected. Needed for setting size of empty
    # features, e.g. when no template hits were found.
    tensor_dict = parse_reshape_logic(tensor_dict, features_metadata)
    return tensor_dict


def protein_features_shape(
        feature_name: str,
        num_residues: int,
        msa_length: int,
        num_templates: Optional[int] = None,
        features: Optional[FeaturesMetadata] = None):
    """Returns the concrete shape for the given feature name.

    The shape template is looked up in `features` and each placeholder
    (NUM_RES, NUM_SEQ, NUM_TEMPLATES) is substituted by the value passed in.
    The substituted values may be Python ints or tensors.

    Args:
        feature_name: String identifier for the feature. If it ends with
            "_unnormalized", that suffix is stripped before the lookup.
        num_residues: Value substituted for the NUM_RES placeholder.
        msa_length: Value substituted for the NUM_SEQ placeholder.
        num_templates: Value substituted for the NUM_TEMPLATES placeholder;
            when None the placeholder is left unresolved.
        features: A feature_name -> (dtype, shape) lookup; defaults to
            FEATURES.

    Returns:
        A list with one entry per dimension of the feature.

    Raises:
        ValueError: If any dimension is still a placeholder string after
            substitution.
    """
    features = features or FEATURES
    suffix = "_unnormalized"
    if feature_name.endswith(suffix):
        feature_name = feature_name[:-len(suffix)]

    # `features` is a FeaturesMetadata mapping: name -> (dtype, raw shape).
    unused_dtype, raw_sizes = features[feature_name]

    replacements = {NUM_RES: num_residues, NUM_SEQ: msa_length}
    if num_templates is not None:
        replacements[NUM_TEMPLATES] = num_templates

    # Dimensions that are not placeholders fall through unchanged.
    sizes = [replacements.get(dimension, dimension) for dimension in raw_sizes]
    for dimension in sizes:
        if isinstance(dimension, str):
            raise ValueError(
                "Could not parse %s (shape: %s) with values: %s" % (
                    feature_name, raw_sizes, replacements))
    return sizes


def parse_reshape_logic(
        parsed_features: TensorDict,
        features: FeaturesMetadata,
        key: Optional[str] = None) -> TensorDict:
    """Reshapes every parsed feature to the shape declared in `features`.

    Args:
        parsed_features: Feature name -> tensor; must contain "seq_length".
            The dict is updated in place and also returned.
        features: Feature name -> (dtype, shape) metadata used to derive the
            target shapes.
        key: Optional value stored as parsed_features["key"] (wrapped in a
            list) when "key" is present in `features`.

    Returns:
        `parsed_features` with each value replaced by its reshaped tensor.
    """
    # Find out what is the number of sequences and the number of alignments.
    num_residues = tf.cast(
        _first(parsed_features["seq_length"]), dtype=tf.int32)

    if "num_alignments" in parsed_features:
        num_msa = tf.cast(
            _first(parsed_features["num_alignments"]), dtype=tf.int32)
    else:
        num_msa = 0

    if "template_domain_names" in parsed_features:
        num_templates = tf.cast(
            tf.shape(parsed_features["template_domain_names"])[0],
            dtype=tf.int32)
    else:
        num_templates = 0

    if key is not None and "key" in features:
        parsed_features["key"] = [key]  # Expand dims from () to (1,).

    # Reshape the tensors according to the sequence length and num alignments.
    for k, v in parsed_features.items():
        new_shape = protein_features_shape(
            feature_name=k,
            num_residues=num_residues,
            msa_length=num_msa,
            num_templates=num_templates,
            features=features)
        new_shape_size = tf.constant(1, dtype=tf.int32)
        for dim in new_shape:
            new_shape_size *= tf.cast(dim, tf.int32)

        # Assertion op: raises if the element count of `v` differs from the
        # element count implied by the target shape.
        assert_equal = tf.assert_equal(
            tf.size(v), new_shape_size,
            name="assert_%s_shape_correct" % k,
            message="The size of feature %s (%s) could not be reshaped "
                    "into %s" % (k, tf.size(v), new_shape))
        if "template" not in k:
            # Make sure the feature we are reshaping is not empty.
            assert_non_empty = tf.assert_greater(
                tf.size(v), 0, name="assert_%s_non_empty" % k,
                message="The feature %s is not set in the tf.Example. Either do not "
                        "request the feature or use a tf.Example that has the "
                        "feature set." % k)
            with tf.control_dependencies([assert_non_empty, assert_equal]):
                parsed_features[k] = tf.reshape(
                    v, new_shape, name="reshape_%s" % k)
        else:
            # Template features are allowed to be empty, so only the size
            # check applies.
            with tf.control_dependencies([assert_equal]):
                parsed_features[k] = tf.reshape(
                    v, new_shape, name="reshape_%s" % k)

    return parsed_features


def _first(tensor: tf.Tensor) -> tf.Tensor:
    """Returns the 1st element - the input can be a tensor or a scalar."""
    # Flatten to 1-D first so that scalars and N-D tensors index uniformly.
    return tf.reshape(tensor, shape=(-1,))[0]


# Load the pickled list of FeatureDicts.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# this with a trusted "HBB_features_lst.pkl".
with open("HBB_features_lst.pkl", 'rb') as f:
    HBB_features_lst = pickle.load(f)

Human_HBB_feature_dict = HBB_features_lst[0]
print(Human_HBB_feature_dict.keys())

features = FEATURES.keys()
Human_HBB_tensor_dict = np_to_tensor_dict(
    Human_HBB_feature_dict, features=features)
print(Human_HBB_tensor_dict.keys())

本文來自互聯網用戶投稿,該文觀點僅代表作者本人,不代表本站立場。本站僅提供信息存儲空間服務,不擁有所有權,不承擔相關法律責任。
如若轉載,請注明出處:http://www.pswp.cn/news/209609.shtml
繁體地址,請注明出處:http://hk.pswp.cn/news/209609.shtml
英文地址,請注明出處:http://en.pswp.cn/news/209609.shtml

如若內容造成侵權/違法違規/事實不符,請聯系多彩編程網進行投訴反饋email:809451989@qq.com,一經查實,立即刪除!

相關文章

postgresql_conf中常用配置項

在 PostgreSQL 的 postgresql.conf 配置文件中，有許多常用的配置項，這些配置項可以根據特定需求和性能優化進行調整。以下是一些常用的配置項及其作用： 1. shared_buffers 用于設置 PostgreSQL 實例使用的共享內存緩沖區大小。增加此值可以…

游戲被攻擊該怎么辦?游戲盾該如何使用,游戲盾如何防護攻擊

隨著Internet互聯網絡帶寬的增加和多種DDOS黑客工具的不斷發布，DDOS拒絕服務攻擊的實施越來越容易，DDOS攻擊事件正在成上升趨勢。出于商業競爭、打擊報復和網絡敲詐等多種因素，導致很多商業站點、游戲服務器、聊天網絡等網絡服務商長期以來一…

Nacos 配置加密功能也太雞肋了吧,有種更好的方式

大家好，我是風箏，微信搜「古時的風箏」，更多干貨 當項目中用了 Nacos 做配置中心，是不是所有的配置都放到里面呢，大部分時候為了省事和統一，系統所有的配置都直接放在里面了，有時候，…

什么是自動化測試框架?常用的自動化測試框架有哪些?

無論是在自動化測試實踐，還是日常交流中，經常聽到一個詞：框架。之前學習自動化測試的過程中，一直對“框架”這個詞知其然不知其所以然。 最近看了很多自動化相關的資料，加上自己的一些實踐，算是對“框架”…

Redis相關知識

yum安裝redis 使用以下命令：直接將redis安裝到Linux服務器（Xshell）中 yum -y install redis 啟動redis 使用以下命令，以后臺運行方式啟動redis redis-server /etc/redis.conf & 操作redis 使用以下命令啟動redis客戶端 redis-…

RFID在新能源工廠大放異彩

RFID在新能源工廠大放異彩 我國在十四五規劃中提出了建設綠色低碳發展的目標，新能源產業成為了國家發展的重點領域之一，開始大力支持各種新能源廠商發展。各個廠商之間不僅比產品、比技術。也比生產想要降本增效，為了實現這一目標…

MBD Introduction

介紹 MATLAB是MathWorks公司的商業數學軟件，應用于科學計算、可視化以及交互式程序設計等高科技計算環境。Simulink是MATLAB中的一種可視化仿真工具。 Simulink是一個模塊圖環境，用于多域仿真以及基于模型的設計。它支持系統設計、仿真、自動代碼生成以…

Spring基于xml半注解開發

目錄 Component的使用 依賴注解的使用 非自定義Bean的注解開發 Component的使用 基本Bean注解&#xff0c;主要是使用注解的方式替代原有的xml的<bean>標簽及其標簽屬性的配置&#xff0c;使用Component注解替代<bean>標簽中的id以及class屬性&#xff0c;而對…

算法Day26 數位統計

數位統計 Description 給你一個整數n&#xff0c;統計并返回各位數字都不同的數字x的個數&#xff0c;其中0 ≤ x < 10^n。 Input 輸入整數n 0≤n≤13 Output 輸出整數個數 Sample 代碼 import java.util.Scanner;public class Main {public static void main(String[] ar…

一個Oracle Application Container的實例

本例基本涵蓋了Oracle Multitenant功能中application container的以下內容： 創建application container/root創建application PDB創建application SEED在application root中安裝application在application root中升級application同步application 整個過程如下 創建…

Epoll服務器(ET工作模式)

目錄 Epoll ET服務器設計思路Connection類TcpServer類 回調函數Accepter函數Recever函數Sender函數Excepter函數 事件處理套接字相關接口封裝運行Epoll服務器 Epoll ET服務器 設計思路 在epoll ET服務器中，我們需要處理如下幾種事件： 讀事件…

基于javaweb實現的圖書借閱管理系統

一、系統架構 前端：jsp | js | css | jquery 后端：servlet | jdbc 環境：jdk1.7 | mysql | tomcat 二、代碼及數據庫 三、功能介紹 01. 登錄頁 02. 首頁 03. 圖書管理 04. 讀者管理 05. 圖書分類管理 06. 圖書借閱信息 07. 圖書歸還信…

CDN加速技術:降低服務器與網站成本的智慧選擇

隨著互聯網的飛速發展，網站的訪問量不斷攀升，服務器負載壓力逐漸增大。為了提高用戶體驗、降低服務器成本，并確保網站的高可用性，CDN（內容分發網絡）加速技術應運而生。本文將從服務器與網站成本的角度分析C…

NLP項目實戰01--電影評論分類

介紹： 歡迎來到本篇文章！在這里，我們將探討一個常見而重要的自然語言處理任務——文本分類。具體而言，我們將關注情感分析任務，即通過分析電影評論的情感來判斷評論是正面的、負面的。 展示： 訓練展示如下…

比較不同聚類方法的評估指標

歸一化互信息（NMI） 要求：需要每個序列的真實標簽（分類信息）

你在地鐵上修過bug嗎?

作為技術人員，有沒有遇到下班路上收到老板電話，系統故障，然后地鐵上掏出電腦，修bug的場景。自己負責的業務線上出現問題，負責人心里是很慌的，在這種心理狀態下做事很容易二次犯錯，造成更大的問題…

SAP UI5 walkthrough step10 Descriptor for Applications

在這一步，我們將會把所有的應用相關的描述性的文件獨立放到manifest.json 新建一個manifest.json文件 webapp/manifest.json (New) {"_version": "1.58.0","sap.app": {"id": "ui5.walkthrough","i18n"…

【已解決】No module named ‘sklearn‘

問題描述 No module named ‘sklearn‘ 解決辦法 pip install scikit-learn 完結撒花 契約、包容、感恩、原則……這些成年人該有的基本精神，為什么我在他們身上找不到呢？

圖像疊加中文字體

目錄 1) 前言2) freetype下載3) Demo3.1) 下載3.2) 編譯3.3) 運行3.4) 結果3.5) 更詳細的使用見目錄中說明 4) 積少成多 1) 前言 最近在做圖片、視頻疊加文字，要求支持中文，基本原理是將圖片或視頻解碼后疊加文字，之后做圖片或視頻編碼即可。…

ASP.NET Core概述-微軟已經收購了mono,為什么還搞.NET Core呢

一、.NET Core概述 1、相關歷程 .NET在設計之初也是考慮像Java一樣跨平臺，.NET Framework是在Windows下運行的，大部分類是可以兼容移植到Linux下，但是沒有人做這個工作。 2001年米格爾為Gnome尋找桌面開發技術，在研究了微軟的.…