I.2 主要データベースの実践的活用法
I.2.1 NCBI データベース群の効率的利用
GenBank/RefSeq: 配列データの取得と品質管理
🧪 概念例(擬似コード)
from Bio import Entrez, SeqIO
import requests
import pandas as pd
from typing import List, Dict, Optional
import time
import logging
class NCBIDataRetriever:
    """Efficient data retrieval from the NCBI databases via the Entrez API."""

    def __init__(self, email: str, api_key: Optional[str] = None):
        """
        Args:
            email: Contact e-mail address required by the NCBI usage policy.
            api_key: Optional API key that relaxes the request rate limit (recommended).
        """
        Entrez.email = email
        if api_key:
            Entrez.api_key = api_key
        # Logging setup
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        # Rate limit: NCBI allows 10 req/sec with an API key, 3 req/sec without.
        self.request_delay = 0.34 if api_key else 1.0

    def search_sequences(self, query: str, database: str = "nucleotide",
                         max_results: int = 100, filters: Optional[Dict] = None) -> List[str]:
        """
        Search for sequences and return their IDs.

        Args:
            query: Search query (e.g. "BRCA1[Gene] AND human[Organism]").
            database: Target database.
            max_results: Maximum number of IDs to retrieve.
            filters: Optional filter conditions ("organism", "molecular_type", "date_range").

        Returns:
            List[str]: GenBank accession IDs.

        Raises:
            Exception: Re-raises any Entrez error after logging it.
        """
        try:
            # Apply filters by appending fielded terms to the query.
            if filters:
                filter_terms = []
                if "organism" in filters:
                    filter_terms.append(f"{filters['organism']}[Organism]")
                if "molecular_type" in filters:
                    filter_terms.append(f"{filters['molecular_type']}[Properties]")
                if "date_range" in filters:
                    filter_terms.append(f"{filters['date_range']}[Publication Date]")
                if filter_terms:
                    query += " AND " + " AND ".join(filter_terms)
            self.logger.info(f"実行クエリ: {query}")
            # Run the search; close the handle even if parsing fails.
            handle = Entrez.esearch(db=database, term=query, retmax=max_results)
            try:
                search_results = Entrez.read(handle)
            finally:
                handle.close()
            id_list = search_results["IdList"]
            self.logger.info(f"検索結果: {len(id_list)}件")
            return id_list
        except Exception as e:
            self.logger.error(f"検索エラー: {e}")
            raise

    def fetch_sequences_batch(self, id_list: List[str], database: str = "nucleotide",
                              batch_size: int = 100, format: str = "fasta") -> Dict[str, str]:
        """
        Retrieve sequence data in batches.

        Args:
            id_list: GenBank IDs.
            database: Database name.
            batch_size: Number of IDs per efetch request.
            format: Output format ("fasta", "gb"/"genbank", or any other efetch rettype).

        Returns:
            Dict[str, str]: Mapping of record ID -> sequence data. For formats that
            cannot be split per record, the raw payload is keyed by the joined batch IDs.
        """
        sequences = {}
        # Process in batches to respect NCBI request-size limits.
        for i in range(0, len(id_list), batch_size):
            batch_ids = id_list[i:i + batch_size]
            try:
                self.logger.info(f"バッチ {i//batch_size + 1}: {len(batch_ids)}件処理中")
                handle = Entrez.efetch(
                    db=database,
                    id=",".join(batch_ids),
                    rettype=format,
                    retmode="text"
                )
                try:
                    if format == "fasta":
                        for record in SeqIO.parse(handle, "fasta"):
                            sequences[record.id] = str(record.seq)
                    elif format in ("gb", "genbank"):
                        # Parse GenBank records individually so each ID maps to its own entry.
                        for record in SeqIO.parse(handle, "genbank"):
                            sequences[record.id] = record.format("genbank")
                    else:
                        # BUGFIX: the original called handle.read() once per ID inside a dict
                        # comprehension; the first call consumed the entire stream, leaving
                        # every other ID mapped to "". Read the payload exactly once instead.
                        sequences[",".join(batch_ids)] = handle.read()
                finally:
                    handle.close()
                # Rate limiting between requests.
                time.sleep(self.request_delay)
            except Exception as e:
                self.logger.error(f"バッチ処理エラー (batch {i//batch_size + 1}): {e}")
                continue
        return sequences

    def get_gene_summary(self, gene_symbol: str, organism: str = "human") -> Dict:
        """
        Retrieve comprehensive information about a gene.

        Args:
            gene_symbol: Gene symbol (e.g. BRCA1).
            organism: Organism name.

        Returns:
            Dict: Gene information, or {"error": ...} on failure.
        """
        try:
            # Search the Gene database for the symbol/organism pair.
            gene_query = f"{gene_symbol}[Gene] AND {organism}[Organism]"
            gene_handle = Entrez.esearch(db="gene", term=gene_query, retmax=1)
            try:
                gene_results = Entrez.read(gene_handle)
            finally:
                gene_handle.close()
            if not gene_results["IdList"]:
                return {"error": f"遺伝子 {gene_symbol} が見つかりません"}
            gene_id = gene_results["IdList"][0]
            # Fetch the summary record for the gene.
            summary_handle = Entrez.esummary(db="gene", id=gene_id)
            try:
                summary = Entrez.read(summary_handle)[0]
            finally:
                summary_handle.close()
            # Fetch linked nucleotide sequence IDs.
            link_handle = Entrez.elink(dbfrom="gene", db="nucleotide", id=gene_id)
            try:
                link_results = Entrez.read(link_handle)
            finally:
                link_handle.close()
            nucleotide_ids = []
            if link_results[0]["LinkSetDb"]:
                nucleotide_ids = [link["Id"] for link in link_results[0]["LinkSetDb"][0]["Link"]]
            return {
                "gene_id": gene_id,
                "symbol": summary.get("Name", ""),
                "description": summary.get("Description", ""),
                "summary": summary.get("Summary", ""),
                "chromosome": summary.get("Chromosome", ""),
                "map_location": summary.get("MapLocation", ""),
                "gene_type": summary.get("GeneType", ""),
                "associated_sequences": len(nucleotide_ids),
                "nucleotide_ids": nucleotide_ids[:10]  # only the first 10
            }
        except Exception as e:
            self.logger.error(f"遺伝子情報取得エラー: {e}")
            return {"error": str(e)}
# Usage example
if __name__ == "__main__":
    # Initialization (set a real e-mail address and API key when actually using this)
    client = NCBIDataRetriever(
        email="your.email@example.com",
        api_key="your_api_key_here"  # optional
    )

    # 1. Fetch basic information about the BRCA1 gene
    gene_info = client.get_gene_summary("BRCA1", "human")
    print("BRCA1遺伝子情報:")
    for label, key in (("説明", "description"),
                       ("染色体", "chromosome"),
                       ("関連配列数", "associated_sequences")):
        print(f"- {label}: {gene_info.get(key, 'N/A')}")

    # 2. Search for and retrieve COVID-19 related sequences
    search_filters = {
        "organism": "SARS-CoV-2",
        "molecular_type": "genomic RNA",
        "date_range": "2020/01/01:2024/12/31"
    }
    hit_ids = client.search_sequences(
        query="complete genome",
        database="nucleotide",
        max_results=50,
        filters=search_filters
    )
    if hit_ids:
        # Fetch the sequences in batches (only 10 IDs as a test)
        fetched = client.fetch_sequences_batch(hit_ids[:10], batch_size=5)
        print(f"\n取得したCOVID-19配列: {len(fetched)}件")
        for accession, seq in list(fetched.items())[:3]:
            print(f"- {accession}: {len(seq)} bp")
SRA: シークエンシングデータの効率的アクセス
🧪 概念例(擬似コード)
import subprocess
import os
import pandas as pd
from pathlib import Path
import xml.etree.ElementTree as ET
import requests
from typing import List, Dict, Optional
import concurrent.futures
import hashlib
class SRADataManager:
    """Efficient management of SRA (Sequence Read Archive) data."""

    def __init__(self, work_dir: str = "./sra_data", max_workers: int = 4):
        """
        Args:
            work_dir: Working directory for downloaded data.
            max_workers: Default number of parallel downloads.
        """
        self.work_dir = Path(work_dir)
        # BUGFIX: parents=True so a nested work_dir path does not raise FileNotFoundError.
        self.work_dir.mkdir(parents=True, exist_ok=True)
        self.max_workers = max_workers
        # Verify that the SRA Toolkit is available.
        self._check_sra_toolkit()

    def _check_sra_toolkit(self):
        """Check whether the SRA Toolkit (fastq-dump) is installed; warn if not."""
        try:
            result = subprocess.run(["fastq-dump", "--version"],
                                    capture_output=True, text=True)
            if result.returncode == 0:
                print(f"SRA Toolkit確認済み: {result.stdout.strip()}")
                return
        except FileNotFoundError:
            pass
        # Either the binary is missing or it exited with a non-zero status.
        print("警告: SRA Toolkitが見つかりません")
        print("インストール方法: conda install -c bioconda sra-tools")

    def search_sra_studies(self, query: str, max_results: int = 100) -> pd.DataFrame:
        """
        Search SRA studies.

        Args:
            query: Search query (e.g. "RNA-seq AND human AND cancer").
            max_results: Maximum number of hits.

        Returns:
            pd.DataFrame: Search results (empty on error or no hits).
        """
        try:
            # Search for SRA study IDs via ESearch.
            search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
            search_params = {
                "db": "sra",
                "term": query,
                "retmax": max_results,
                "retmode": "xml"
            }
            # BUGFIX: explicit timeout so a stalled request cannot hang forever.
            response = requests.get(search_url, params=search_params, timeout=30)
            root = ET.fromstring(response.content)
            # Collect the SRA ID list.
            sra_ids = [id_elem.text for id_elem in root.findall(".//Id")]
            if not sra_ids:
                print("検索結果なし")
                return pd.DataFrame()
            # Fetch summaries for the hits via ESummary.
            summary_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
            summary_params = {
                "db": "sra",
                "id": ",".join(sra_ids),
                "retmode": "xml"
            }
            response = requests.get(summary_url, params=summary_params, timeout=30)
            root = ET.fromstring(response.content)
            # Parse the DocSum entries into flat records.
            studies = []
            for doc_sum in root.findall(".//DocSum"):
                study_info = {"SRA_ID": doc_sum.find("Id").text}
                for item in doc_sum.findall(".//Item"):
                    name = item.get("Name")
                    if name in ["Title", "Platform", "Organism", "LibraryStrategy",
                                "LibrarySource", "SampleAccession", "StudyAccession"]:
                        study_info[name] = item.text or ""
                studies.append(study_info)
            df = pd.DataFrame(studies)
            print(f"検索結果: {len(df)}件のSRAスタディを取得")
            return df
        except Exception as e:
            print(f"SRA検索エラー: {e}")
            return pd.DataFrame()

    def get_run_info(self, study_accession: str) -> pd.DataFrame:
        """
        Retrieve detailed information on the runs in a study.

        Args:
            study_accession: SRA study accession (e.g. SRP123456).

        Returns:
            pd.DataFrame: Run information (empty on error).
        """
        try:
            # The Traces RunInfo endpoint returns CSV.
            url = f"https://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&db=sra&rettype=runinfo&term={study_accession}"
            # BUGFIX: explicit timeout so a stalled request cannot hang forever.
            response = requests.get(url, timeout=60)
            if response.status_code == 200:
                # Convert the CSV payload into a DataFrame.
                from io import StringIO
                df = pd.read_csv(StringIO(response.text))
                # Keep only the columns relevant for downstream processing.
                important_cols = [
                    "Run", "SampleName", "Experiment", "LibraryStrategy",
                    "LibrarySource", "Platform", "Instrument", "InsertSize",
                    "LibraryLayout", "spots", "bases", "download_path"
                ]
                available_cols = [col for col in important_cols if col in df.columns]
                df_filtered = df[available_cols]
                print(f"ラン情報取得: {len(df_filtered)}件")
                return df_filtered
            else:
                print(f"RunInfo取得失敗: HTTP {response.status_code}")
                return pd.DataFrame()
        except Exception as e:
            print(f"RunInfo取得エラー: {e}")
            return pd.DataFrame()

    def download_fastq(self, run_accession: str, output_dir: Optional[str] = None,
                       paired: bool = True, compressed: bool = True) -> Dict[str, str]:
        """
        Download FASTQ files for a run via fastq-dump.

        Args:
            run_accession: Run accession (e.g. SRR123456).
            output_dir: Output directory (defaults to <work_dir>/fastq).
            paired: True for paired-end data (splits into _1/_2 files).
            compressed: True to store gzip-compressed files.

        Returns:
            Dict[str, str]: Mapping of read label -> downloaded file path ({} on failure).
        """
        if output_dir is None:
            output_dir = self.work_dir / "fastq"
        output_dir = Path(output_dir)
        # BUGFIX: parents=True so a nested output path does not raise FileNotFoundError.
        output_dir.mkdir(parents=True, exist_ok=True)
        try:
            # Build the fastq-dump command (list form, shell=False: no injection risk).
            cmd = ["fastq-dump"]
            if paired:
                cmd.extend(["--split-files"])  # split paired-end reads into two files
            if compressed:
                cmd.extend(["--gzip"])  # store compressed output
            cmd.extend(["--outdir", str(output_dir)])
            cmd.append(run_accession)
            print(f"FASTQダウンロード開始: {run_accession}")
            print(f"コマンド: {' '.join(cmd)}")
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0:
                # Confirm which files actually appeared on disk.
                downloaded_files = {}
                suffix = ".fastq.gz" if compressed else ".fastq"
                if paired:
                    # Paired-end files: <run>_1 and <run>_2
                    for i in [1, 2]:
                        filepath = output_dir / f"{run_accession}_{i}{suffix}"
                        if filepath.exists():
                            downloaded_files[f"read_{i}"] = str(filepath)
                else:
                    # Single-end file
                    filepath = output_dir / f"{run_accession}{suffix}"
                    if filepath.exists():
                        downloaded_files["reads"] = str(filepath)
                print(f"ダウンロード完了: {len(downloaded_files)}ファイル")
                return downloaded_files
            else:
                print(f"ダウンロードエラー: {result.stderr}")
                return {}
        except Exception as e:
            print(f"FASTQダウンロードエラー: {e}")
            return {}

    def batch_download(self, run_list: List[str], max_concurrent: Optional[int] = None) -> Dict[str, Dict]:
        """
        Download multiple runs in parallel.

        Args:
            run_list: Run accessions.
            max_concurrent: Maximum number of parallel downloads (defaults to max_workers).

        Returns:
            Dict[str, Dict]: Per-run download results; failed runs map to {"error": ...}.
        """
        if max_concurrent is None:
            max_concurrent = self.max_workers
        results = {}
        # Threads are appropriate here: fastq-dump runs as a subprocess (I/O-bound).
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_concurrent) as executor:
            future_to_run = {
                executor.submit(self.download_fastq, run): run
                for run in run_list
            }
            for future in concurrent.futures.as_completed(future_to_run):
                run = future_to_run[future]
                try:
                    results[run] = future.result()
                    print(f"完了: {run}")
                except Exception as e:
                    print(f"エラー {run}: {e}")
                    results[run] = {"error": str(e)}
        return results

    def verify_download_integrity(self, file_path: str, expected_md5: Optional[str] = None) -> bool:
        """
        Verify the integrity of a downloaded file.

        Args:
            file_path: Path to the file.
            expected_md5: Expected MD5 hex digest; if None, only existence is checked.

        Returns:
            bool: True if the file exists and (when given) the MD5 matches.
        """
        if not os.path.exists(file_path):
            return False
        if expected_md5:
            # Stream in chunks so large FASTQ files are not loaded into memory at once.
            hasher = hashlib.md5()
            with open(file_path, 'rb') as f:
                for chunk in iter(lambda: f.read(4096), b""):
                    hasher.update(chunk)
            return hasher.hexdigest() == expected_md5
        # No MD5 supplied: existence is the only check we can perform.
        return True
# Usage example
if __name__ == "__main__":
    # Initialize the SRA data manager
    manager = SRADataManager(work_dir="./sra_analysis", max_workers=2)

    # 1. Search for COVID-19 related RNA-seq studies
    studies = manager.search_sra_studies(
        query="COVID-19 AND RNA-seq AND human",
        max_results=20
    )
    if not studies.empty:
        print("\n検索結果(最初の5件):")
        print(studies.head()[["StudyAccession", "Title", "Platform"]].to_string())

        # 2. Fetch run information for the first study
        study_acc = studies.iloc[0]["StudyAccession"]
        runs = manager.get_run_info(study_acc)
        if not runs.empty:
            print(f"\n{study_acc}のラン情報:")
            print(runs.head()[["Run", "LibraryStrategy", "spots", "bases"]].to_string())

            # 3. Download a couple of small samples as a test
            sample_runs = runs.head(2)["Run"].tolist()
            print(f"\nテストダウンロード開始: {sample_runs}")
            # Download each run individually
            for accession in sample_runs:
                files = manager.download_fastq(accession, compressed=True)
                if files:
                    print(f"ダウンロード成功 {accession}: {list(files.keys())}")
                else:
                    print(f"ダウンロード失敗: {accession}")
I.2.2 UniProt データベースの高度な活用
🧪 概念例(擬似コード)
import requests
import pandas as pd
import json
from typing import List, Dict, Optional, Union
import time
import re
from urllib.parse import urlencode
import xml.etree.ElementTree as ET
class UniProtAnalyzer:
    """Comprehensive analysis tool for the UniProt protein database."""

    def __init__(self, rate_limit: float = 1.0):
        """
        Args:
            rate_limit: Delay between API requests, in seconds.
        """
        self.base_url = "https://rest.uniprot.org"
        self.rate_limit = rate_limit
        self.session = requests.Session()
        # Commonly used field sets for the UniProt REST "fields" parameter.
        self.common_fields = {
            "basic": [
                "accession", "id", "gene_names", "protein_name",
                "organism_name", "length", "mass"
            ],
            "sequence": [
                "accession", "sequence", "length", "mass",
                "cc_subcellular_location", "ft_domain"
            ],
            "function": [
                "accession", "protein_name", "cc_function",
                "go_c", "go_f", "go_p", "cc_pathway"
            ],
            "disease": [
                "accession", "gene_names", "cc_disease",
                "cc_involvement_in_disease", "cc_allergen", "cc_toxic_dose"
            ],
            "structure": [
                "accession", "ft_helix", "ft_strand", "ft_turn",
                "ft_disulfid", "xref_pdb", "cc_similarity"
            ]
        }

    def search_proteins(self, query: str, organism: Optional[str] = None,
                        reviewed: Optional[bool] = None, max_results: int = 100,
                        fields: Optional[List[str]] = None) -> pd.DataFrame:
        """
        Search for proteins.

        Args:
            query: Search query (gene name, protein name, keywords, ...).
            organism: Organism (e.g. "human", "mouse", or a taxonomy ID like "9606").
            reviewed: True to restrict the search to Swiss-Prot entries.
            max_results: Maximum number of hits.
            fields: Fields to retrieve (defaults to the "basic" field set).

        Returns:
            pd.DataFrame: Search results (empty on error).
        """
        # Build the query string.
        search_terms = [query]
        if organism:
            # Numeric strings are treated as taxonomy IDs.
            if organism.isdigit():
                search_terms.append(f"taxonomy_id:{organism}")
            else:
                search_terms.append(f"organism:{organism}")
        if reviewed is not None:
            search_terms.append("reviewed:true" if reviewed else "reviewed:false")
        final_query = " AND ".join(search_terms)
        # Field selection.
        if fields is None:
            fields = self.common_fields["basic"]
        # API request parameters.
        params = {
            "query": final_query,
            "format": "tsv",
            "fields": ",".join(fields),
            "size": min(max_results, 500)  # API limit
        }
        try:
            # BUGFIX: explicit timeout so a stalled request cannot hang forever.
            response = self.session.get(f"{self.base_url}/uniprotkb/search",
                                        params=params, timeout=30)
            response.raise_for_status()
            # Convert the TSV payload into a DataFrame.
            from io import StringIO
            df = pd.read_csv(StringIO(response.text), sep='\t')
            print(f"検索結果: {len(df)}件のタンパク質")
            return df
        except requests.exceptions.RequestException as e:
            print(f"UniProt検索エラー: {e}")
            return pd.DataFrame()
        finally:
            time.sleep(self.rate_limit)

    def get_protein_details(self, accession: str,
                            include_features: bool = True,
                            include_interactions: bool = True) -> Dict:
        """
        Retrieve detailed information for one protein.

        Args:
            accession: UniProt accession.
            include_features: Include sequence feature annotations.
            include_interactions: Include interaction partners (fetched via STRING).

        Returns:
            Dict: Structured protein details ({} on error).
        """
        try:
            # Fetch the full JSON entry.
            response = self.session.get(f"{self.base_url}/uniprotkb/{accession}", timeout=30)
            response.raise_for_status()
            protein_data = response.json()
            # Extract a structured summary from the raw entry.
            details = {
                "accession": accession,
                "entry_name": protein_data.get("uniProtkbId", ""),
                "protein_names": self._extract_protein_names(protein_data),
                "gene_names": self._extract_gene_names(protein_data),
                "organism": self._extract_organism(protein_data),
                "sequence_info": self._extract_sequence_info(protein_data),
                "subcellular_location": self._extract_subcellular_location(protein_data),
                "function": self._extract_function(protein_data),
                "go_annotations": self._extract_go_annotations(protein_data),
                "pathways": self._extract_pathways(protein_data),
                "diseases": self._extract_diseases(protein_data)
            }
            # Optional extras.
            if include_features:
                details["features"] = self._extract_features(protein_data)
            if include_interactions:
                details["interactions"] = self.get_protein_interactions(accession)
            return details
        except requests.exceptions.RequestException as e:
            print(f"タンパク質詳細取得エラー: {e}")
            return {}
        finally:
            time.sleep(self.rate_limit)

    def _extract_protein_names(self, data: Dict) -> Dict:
        """Extract recommended/alternative/short protein names from a UniProt entry."""
        names = {"recommended": "", "alternative": [], "short": []}
        if "proteinDescription" in data:
            desc = data["proteinDescription"]
            if "recommendedName" in desc:
                names["recommended"] = desc["recommendedName"].get("fullName", {}).get("value", "")
            if "alternativeNames" in desc:
                for alt in desc["alternativeNames"]:
                    if "fullName" in alt:
                        names["alternative"].append(alt["fullName"].get("value", ""))
                    if "shortNames" in alt:
                        names["short"].extend([sn.get("value", "") for sn in alt["shortNames"]])
        return names

    def _extract_gene_names(self, data: Dict) -> Dict:
        """Extract primary/synonym/locus/ORF gene names from a UniProt entry."""
        genes = {"primary": "", "synonyms": [], "ordered_locus": [], "orf": []}
        if "genes" in data:
            for gene in data["genes"]:
                if gene.get("geneName"):
                    genes["primary"] = gene["geneName"].get("value", "")
                if "synonyms" in gene:
                    genes["synonyms"].extend([syn.get("value", "") for syn in gene["synonyms"]])
                if "orderedLocusNames" in gene:
                    genes["ordered_locus"].extend([oln.get("value", "") for oln in gene["orderedLocusNames"]])
                if "orfNames" in gene:
                    genes["orf"].extend([orf.get("value", "") for orf in gene["orfNames"]])
        return genes

    def _extract_organism(self, data: Dict) -> Dict:
        """Extract organism name and taxonomy ID from a UniProt entry."""
        organism = {"scientific_name": "", "common_name": "", "taxonomy_id": 0}
        if "organism" in data:
            org = data["organism"]
            organism["scientific_name"] = org.get("scientificName", "")
            organism["common_name"] = org.get("commonName", "")
            organism["taxonomy_id"] = org.get("taxId", 0)
        return organism

    def _extract_sequence_info(self, data: Dict) -> Dict:
        """Extract sequence length, mass, checksum and residues from a UniProt entry."""
        seq_info = {"length": 0, "mass": 0, "checksum": "", "sequence": ""}
        if "sequence" in data:
            seq = data["sequence"]
            seq_info["length"] = seq.get("length", 0)
            seq_info["mass"] = seq.get("molWeight", 0)
            seq_info["checksum"] = seq.get("crc64", "")
            seq_info["sequence"] = seq.get("value", "")
        return seq_info

    def _extract_subcellular_location(self, data: Dict) -> List[str]:
        """Extract subcellular location annotations (BUGFIX: was called but never defined)."""
        locations = []
        for comment in data.get("comments", []):
            if comment.get("commentType") == "SUBCELLULAR LOCATION":
                for loc in comment.get("subcellularLocations", []):
                    value = loc.get("location", {}).get("value", "")
                    if value:
                        locations.append(value)
        return locations

    def _extract_function(self, data: Dict) -> str:
        """Extract the first FUNCTION comment text (BUGFIX: was called but never defined)."""
        for comment in data.get("comments", []):
            if comment.get("commentType") == "FUNCTION":
                texts = comment.get("texts", [])
                if texts:
                    return texts[0].get("value", "")
        return ""

    def _extract_pathways(self, data: Dict) -> List[str]:
        """Extract PATHWAY comment texts (BUGFIX: was called but never defined)."""
        pathways = []
        for comment in data.get("comments", []):
            if comment.get("commentType") == "PATHWAY":
                pathways.extend(t.get("value", "") for t in comment.get("texts", []))
        return pathways

    def _extract_diseases(self, data: Dict) -> List[Dict]:
        """Extract DISEASE annotations (BUGFIX: was called but never defined)."""
        diseases = []
        for comment in data.get("comments", []):
            if comment.get("commentType") == "DISEASE":
                disease = comment.get("disease", {})
                diseases.append({
                    "name": disease.get("diseaseId", ""),
                    "description": disease.get("description", "")
                })
        return diseases

    def _extract_features(self, data: Dict) -> List[Dict]:
        """Extract sequence feature annotations (BUGFIX: was called but never defined)."""
        features = []
        for feature in data.get("features", []):
            features.append({
                "type": feature.get("type", ""),
                "description": feature.get("description", ""),
                "location": feature.get("location", {})
            })
        return features

    def _extract_go_annotations(self, data: Dict) -> Dict:
        """Extract Gene Ontology annotations grouped by aspect (F/P/C)."""
        go_terms = {"molecular_function": [], "biological_process": [], "cellular_component": []}
        for ref in data.get("uniProtKBCrossReferences", []):
            if ref.get("database") != "GO":
                continue
            go_id = ref.get("id", "")
            aspect = ""
            term = ""
            evidence = ""
            for prop in ref.get("properties", []):
                if prop.get("key") == "GoTerm":
                    # GoTerm values look like "F:ATP binding" — the prefix letter is
                    # the GO aspect (F/P/C), the remainder is the term name.
                    value = prop.get("value", "")
                    if ":" in value:
                        aspect, term = value.split(":", 1)
                    else:
                        term = value
                elif prop.get("key") == "GoEvidenceType":
                    evidence = prop.get("value", "")
            go_entry = {"id": go_id, "term": term, "evidence": evidence}
            # BUGFIX: the original stripped the "F:"/"P:"/"C:" prefix before testing
            # for it, so no term was ever classified. Classify on the saved aspect.
            if go_id.startswith("GO:"):
                if aspect == "F":
                    go_terms["molecular_function"].append(go_entry)
                elif aspect == "P":
                    go_terms["biological_process"].append(go_entry)
                elif aspect == "C":
                    go_terms["cellular_component"].append(go_entry)
        return go_terms

    def get_protein_interactions(self, accession: str, max_interactions: int = 50) -> List[Dict]:
        """
        Retrieve protein-protein interaction partners.

        Args:
            accession: UniProt accession.
            max_interactions: Maximum number of interactions to return.

        Returns:
            List[Dict]: Interaction records ([] on error).
        """
        try:
            # Uses the STRING database API (UniProt itself does not serve networks).
            string_url = "https://string-db.org/api/json/network"
            params = {
                "identifiers": accession,
                "species": 9606,  # human
                "limit": max_interactions
            }
            response = self.session.get(string_url, params=params, timeout=30)
            response.raise_for_status()
            interactions = []
            for interaction in response.json():
                interactions.append({
                    "partner_a": interaction.get("preferredName_A", ""),
                    "partner_b": interaction.get("preferredName_B", ""),
                    "score": interaction.get("score", 0),
                    "interaction_type": "protein-protein"
                })
            return interactions
        except requests.exceptions.RequestException as e:
            print(f"相互作用情報取得エラー: {e}")
            return []
        finally:
            time.sleep(self.rate_limit)

    def analyze_protein_family(self, gene_family: str, organisms: Optional[List[str]] = None) -> pd.DataFrame:
        """
        Comparative analysis of a protein family across organisms.

        Args:
            gene_family: Gene family name (e.g. "histone", "kinase").
            organisms: Organisms to query (defaults to human/mouse/rat/zebrafish).

        Returns:
            pd.DataFrame: Combined per-organism search results (empty if no hits).
        """
        if organisms is None:
            organisms = ["human", "mouse", "rat", "zebrafish"]
        family_data = []
        for organism in organisms:
            print(f"{organism}で{gene_family}ファミリーを検索中...")
            # Per-organism family search restricted to reviewed (Swiss-Prot) entries.
            results = self.search_proteins(
                query=gene_family,
                organism=organism,
                reviewed=True,
                max_results=100,
                fields=self.common_fields["basic"] + ["cc_function", "go_f"]
            )
            if not results.empty:
                results["organism"] = organism
                results["family"] = gene_family
                family_data.append(results)
            time.sleep(self.rate_limit)
        if not family_data:
            return pd.DataFrame()
        combined_df = pd.concat(family_data, ignore_index=True)
        # Family-level statistics.
        print(f"\n{gene_family}ファミリー解析結果:")
        print(f"- 総タンパク質数: {len(combined_df)}")
        print(f"- 生物種別分布:")
        print(combined_df["organism"].value_counts().to_string())
        return combined_df

    def functional_enrichment_analysis(self, protein_list: List[str]) -> Dict:
        """
        Simple functional enrichment (frequency counting) over a protein list.

        Args:
            protein_list: UniProt accessions (capped at 20 to limit API calls).

        Returns:
            Dict: Top-10 GO terms per aspect, top-10 pathways, and protein count.
        """
        # Fetch details for each protein (features skipped: not needed here).
        proteins_data = []
        for accession in protein_list[:20]:  # cap the number of API calls
            details = self.get_protein_details(accession, include_features=False)
            if details:
                proteins_data.append(details)
            time.sleep(self.rate_limit)
        if not proteins_data:
            return {}
        # Tally GO terms per aspect.
        go_counts = {"molecular_function": {}, "biological_process": {}, "cellular_component": {}}
        for protein in proteins_data:
            for category, terms in protein.get("go_annotations", {}).items():
                for term in terms:
                    term_name = term.get("term", "Unknown")
                    go_counts[category][term_name] = go_counts[category].get(term_name, 0) + 1
        # Tally pathways.
        pathway_counts = {}
        for protein in proteins_data:
            for pathway in protein.get("pathways", []):
                pathway_counts[pathway] = pathway_counts.get(pathway, 0) + 1

        def _top10(counts: Dict) -> List:
            # Rank by descending count, keep the ten most frequent entries.
            return sorted(counts.items(), key=lambda x: x[1], reverse=True)[:10]

        return {
            "go_molecular_function": _top10(go_counts["molecular_function"]),
            "go_biological_process": _top10(go_counts["biological_process"]),
            "go_cellular_component": _top10(go_counts["cellular_component"]),
            "pathways": _top10(pathway_counts),
            "total_proteins": len(proteins_data)
        }
# Usage example
if __name__ == "__main__":
    # Initialize the UniProt analysis tool
    analyzer = UniProtAnalyzer(rate_limit=1.0)

    # 1. Detailed analysis of the BRCA1 protein
    print("=== BRCA1タンパク質詳細解析 ===")
    details = analyzer.get_protein_details("P38398",
                                           include_features=True,
                                           include_interactions=True)
    if details:
        print(f"タンパク質名: {details['protein_names']['recommended']}")
        print(f"遺伝子名: {details['gene_names']['primary']}")
        print(f"配列長: {details['sequence_info']['length']} aa")
        print(f"分子量: {details['sequence_info']['mass']} Da")
        print(f"相互作用数: {len(details.get('interactions', []))}")

    # 2. Comparative analysis of the kinase family
    print("\n=== キナーゼファミリー比較解析 ===")
    kinases = analyzer.analyze_protein_family(
        gene_family="protein kinase",
        organisms=["human", "mouse"]
    )
    if not kinases.empty:
        print("\n上位10タンパク質:")
        print(kinases.head(10)[["Entry", "Gene Names", "Protein names", "organism"]].to_string())

    # 3. Functional enrichment analysis of cancer-related proteins
    print("\n=== がん関連タンパク質の機能的濃縮解析 ===")
    cancer_proteins = ["P53_HUMAN", "P38398", "P04637", "P21359", "Q02952"]  # examples
    # Resolve accessions via a search
    hits = analyzer.search_proteins("cancer AND tumor suppressor",
                                    organism="human",
                                    reviewed=True,
                                    max_results=20)
    if not hits.empty:
        accessions = hits["Entry"].tolist()[:10]
        enrichment = analyzer.functional_enrichment_analysis(accessions)
        print(f"解析対象タンパク質数: {enrichment.get('total_proteins', 0)}")
        print("\n上位GO Molecular Function:")
        for term, count in enrichment.get("go_molecular_function", [])[:5]:
            print(f" {term}: {count}件")
        print("\n上位GO Biological Process:")
        for term, count in enrichment.get("go_biological_process", [])[:5]:
            print(f" {term}: {count}件")