I.2 Practical Use of Major Databases
I.2.1 Efficient Use of the NCBI Databases
GenBank/RefSeq: Sequence Retrieval and Quality Control
from Bio import Entrez, SeqIO
from typing import List, Dict, Optional
import time
import logging

class NCBIDataRetriever:
    """Efficient data retrieval from NCBI databases"""

    def __init__(self, email: str, api_key: Optional[str] = None):
        """
        Args:
            email: Email address required by the NCBI usage policy
            api_key: API key that relaxes the rate limit (recommended)
        """
        Entrez.email = email
        if api_key:
            Entrez.api_key = api_key

        # Logging setup
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Rate limit: 10 req/sec with an API key, 3 req/sec without
        self.request_delay = 0.1 if api_key else 0.34
    def search_sequences(self, query: str, database: str = "nucleotide",
                         max_results: int = 100, filters: Optional[Dict] = None) -> List[str]:
        """
        Search for sequences and return their IDs.

        Args:
            query: Search query (e.g. "BRCA1[Gene] AND human[Organism]")
            database: Target database
            max_results: Maximum number of results
            filters: Filter conditions

        Returns:
            List[str]: GenBank accession IDs
        """
        try:
            # Apply filters
            if filters:
                filter_terms = []
                if "organism" in filters:
                    filter_terms.append(f"{filters['organism']}[Organism]")
                if "molecular_type" in filters:
                    filter_terms.append(f"{filters['molecular_type']}[Properties]")
                if "date_range" in filters:
                    filter_terms.append(f"{filters['date_range']}[Publication Date]")
                if filter_terms:
                    query += " AND " + " AND ".join(filter_terms)

            self.logger.info(f"Executing query: {query}")

            # Run the search
            handle = Entrez.esearch(db=database, term=query, retmax=max_results)
            search_results = Entrez.read(handle)
            handle.close()

            id_list = search_results["IdList"]
            self.logger.info(f"Search results: {len(id_list)} hits")
            return id_list

        except Exception as e:
            self.logger.error(f"Search error: {e}")
            raise
    def fetch_sequences_batch(self, id_list: List[str], database: str = "nucleotide",
                              batch_size: int = 100, format: str = "fasta") -> Dict[str, str]:
        """
        Retrieve sequence data in batches.

        Args:
            id_list: GenBank IDs
            database: Database name
            batch_size: Batch size
            format: Output format

        Returns:
            Dict[str, str]: Mapping of ID -> sequence data
        """
        sequences = {}

        # Process in batches
        for i in range(0, len(id_list), batch_size):
            batch_ids = id_list[i:i + batch_size]
            try:
                self.logger.info(f"Batch {i//batch_size + 1}: processing {len(batch_ids)} records")

                # Fetch the data
                handle = Entrez.efetch(
                    db=database,
                    id=",".join(batch_ids),
                    rettype=format,
                    retmode="text"
                )

                if format == "fasta":
                    for record in SeqIO.parse(handle, "fasta"):
                        sequences[record.id] = str(record.seq)
                elif format in ("gb", "genbank"):
                    # GenBank responses can also be split per entry
                    for record in SeqIO.parse(handle, "genbank"):
                        sequences[record.id] = record.format("genbank")
                else:
                    # Other formats: store the raw response once per batch
                    # (the stream can only be read once)
                    sequences[",".join(batch_ids)] = handle.read()
                handle.close()

                # Rate limiting
                time.sleep(self.request_delay)

            except Exception as e:
                self.logger.error(f"Batch error (batch {i//batch_size + 1}): {e}")
                continue

        return sequences
    def get_gene_summary(self, gene_symbol: str, organism: str = "human") -> Dict:
        """
        Retrieve comprehensive gene information.

        Args:
            gene_symbol: Gene symbol (e.g. BRCA1)
            organism: Organism

        Returns:
            Dict: Gene information
        """
        try:
            # Search the Gene database
            gene_query = f"{gene_symbol}[Gene] AND {organism}[Organism]"
            gene_handle = Entrez.esearch(db="gene", term=gene_query, retmax=1)
            gene_results = Entrez.read(gene_handle)
            gene_handle.close()

            if not gene_results["IdList"]:
                return {"error": f"Gene {gene_symbol} not found"}

            gene_id = gene_results["IdList"][0]

            # Fetch the summary record
            summary_handle = Entrez.esummary(db="gene", id=gene_id)
            summary = Entrez.read(summary_handle)[0]
            summary_handle.close()

            # Fetch linked nucleotide sequences
            link_handle = Entrez.elink(dbfrom="gene", db="nucleotide", id=gene_id)
            link_results = Entrez.read(link_handle)
            link_handle.close()

            nucleotide_ids = []
            if link_results[0]["LinkSetDb"]:
                nucleotide_ids = [link["Id"] for link in link_results[0]["LinkSetDb"][0]["Link"]]

            return {
                "gene_id": gene_id,
                "symbol": summary.get("Name", ""),
                "description": summary.get("Description", ""),
                "summary": summary.get("Summary", ""),
                "chromosome": summary.get("Chromosome", ""),
                "map_location": summary.get("MapLocation", ""),
                "gene_type": summary.get("GeneType", ""),
                "associated_sequences": len(nucleotide_ids),
                "nucleotide_ids": nucleotide_ids[:10]  # first 10 only
            }

        except Exception as e:
            self.logger.error(f"Gene lookup error: {e}")
            return {"error": str(e)}
# Usage example
if __name__ == "__main__":
    # Initialization (set a real email address and API key when running)
    retriever = NCBIDataRetriever(
        email="your.email@example.com",
        api_key="your_api_key_here"  # optional
    )

    # 1. Basic information for the BRCA1 gene
    brca1_info = retriever.get_gene_summary("BRCA1", "human")
    print("BRCA1 gene information:")
    print(f"- Description: {brca1_info.get('description', 'N/A')}")
    print(f"- Chromosome: {brca1_info.get('chromosome', 'N/A')}")
    print(f"- Linked sequences: {brca1_info.get('associated_sequences', 'N/A')}")

    # 2. Search and fetch COVID-19 related sequences
    covid_filters = {
        "organism": "SARS-CoV-2",
        "molecular_type": "genomic RNA",
        "date_range": "2020/01/01:2024/12/31"
    }

    covid_ids = retriever.search_sequences(
        query="complete genome",
        database="nucleotide",
        max_results=50,
        filters=covid_filters
    )

    if covid_ids:
        # Fetch sequences in batches
        covid_sequences = retriever.fetch_sequences_batch(
            covid_ids[:10],  # only 10 for testing
            batch_size=5
        )
        print(f"\nRetrieved COVID-19 sequences: {len(covid_sequences)}")
        for seq_id, sequence in list(covid_sequences.items())[:3]:
            print(f"- {seq_id}: {len(sequence)} bp")
SRA: Efficient Access to Sequencing Data
import subprocess
import os
import pandas as pd
from io import StringIO
from pathlib import Path
import xml.etree.ElementTree as ET
import requests
from typing import List, Dict, Optional
import concurrent.futures
import hashlib

class SRADataManager:
    """Efficient management of SRA (Sequence Read Archive) data"""

    def __init__(self, work_dir: str = "./sra_data", max_workers: int = 4):
        """
        Args:
            work_dir: Working directory
            max_workers: Number of parallel downloads
        """
        self.work_dir = Path(work_dir)
        self.work_dir.mkdir(parents=True, exist_ok=True)
        self.max_workers = max_workers

        # Check that the SRA Toolkit is available
        self._check_sra_toolkit()

    def _check_sra_toolkit(self):
        """Verify that the SRA Toolkit is installed"""
        try:
            result = subprocess.run(["fastq-dump", "--version"],
                                    capture_output=True, text=True)
            if result.returncode == 0:
                print(f"SRA Toolkit found: {result.stdout.strip()}")
            else:
                raise FileNotFoundError
        except FileNotFoundError:
            print("Warning: SRA Toolkit not found")
            print("Install with: conda install -c bioconda sra-tools")
    def search_sra_studies(self, query: str, max_results: int = 100) -> pd.DataFrame:
        """
        Search for SRA studies.

        Args:
            query: Search query (e.g. "RNA-seq AND human AND cancer")
            max_results: Maximum number of results

        Returns:
            pd.DataFrame: Search results
        """
        try:
            # Search SRA via ESearch
            search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
            search_params = {
                "db": "sra",
                "term": query,
                "retmax": max_results,
                "retmode": "xml"
            }
            response = requests.get(search_url, params=search_params)
            root = ET.fromstring(response.content)

            # Collect the SRA ID list
            sra_ids = [id_elem.text for id_elem in root.findall(".//Id")]
            if not sra_ids:
                print("No results")
                return pd.DataFrame()

            # Fetch summary records
            summary_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
            summary_params = {
                "db": "sra",
                "id": ",".join(sra_ids),
                "retmode": "xml"
            }
            response = requests.get(summary_url, params=summary_params)
            root = ET.fromstring(response.content)

            # Parse the results. For db=sra, esummary nests most metadata as
            # escaped XML inside each DocSum's "ExpXml" item, so it has to be
            # re-parsed before fields like Title or Platform become visible.
            studies = []
            for doc_sum in root.findall(".//DocSum"):
                study_info = {"SRA_ID": doc_sum.findtext("Id", default="")}
                for item in doc_sum.findall(".//Item"):
                    if item.get("Name") == "ExpXml" and item.text:
                        try:
                            exp = ET.fromstring(f"<ExpXml>{item.text}</ExpXml>")
                        except ET.ParseError:
                            continue
                        study_info["Title"] = exp.findtext(".//Title", default="")
                        study_info["Platform"] = exp.findtext(".//Platform", default="")
                        study_info["LibraryStrategy"] = exp.findtext(".//LIBRARY_STRATEGY", default="")
                        study_info["LibrarySource"] = exp.findtext(".//LIBRARY_SOURCE", default="")
                        organism = exp.find(".//Organism")
                        if organism is not None:
                            study_info["Organism"] = organism.get("ScientificName", "")
                        study = exp.find(".//Study")
                        if study is not None:
                            study_info["StudyAccession"] = study.get("acc", "")
                        sample = exp.find(".//Sample")
                        if sample is not None:
                            study_info["SampleAccession"] = sample.get("acc", "")
                studies.append(study_info)

            df = pd.DataFrame(studies)
            print(f"Search results: {len(df)} SRA studies")
            return df

        except Exception as e:
            print(f"SRA search error: {e}")
            return pd.DataFrame()
    def get_run_info(self, study_accession: str) -> pd.DataFrame:
        """
        Retrieve detailed information for the runs in a study.

        Args:
            study_accession: SRA study accession (e.g. SRP123456)

        Returns:
            pd.DataFrame: Run information
        """
        try:
            # Fetch the RunInfo CSV (note: this legacy endpoint may change)
            url = f"https://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&db=sra&rettype=runinfo&term={study_accession}"
            response = requests.get(url)

            if response.status_code == 200:
                # Convert the CSV payload into a DataFrame
                df = pd.read_csv(StringIO(response.text))

                # Keep only the most relevant columns
                important_cols = [
                    "Run", "SampleName", "Experiment", "LibraryStrategy",
                    "LibrarySource", "Platform", "Instrument", "InsertSize",
                    "LibraryLayout", "spots", "bases", "download_path"
                ]
                available_cols = [col for col in important_cols if col in df.columns]
                df_filtered = df[available_cols]

                print(f"Run info retrieved: {len(df_filtered)} runs")
                return df_filtered
            else:
                print(f"RunInfo request failed: HTTP {response.status_code}")
                return pd.DataFrame()

        except Exception as e:
            print(f"RunInfo error: {e}")
            return pd.DataFrame()
    def download_fastq(self, run_accession: str, output_dir: Optional[str] = None,
                       paired: bool = True, compressed: bool = True) -> Dict[str, str]:
        """
        Download FASTQ files.

        Args:
            run_accession: Run accession (e.g. SRR123456)
            output_dir: Output directory
            paired: True for paired-end data
            compressed: True to save gzip-compressed files

        Returns:
            Dict[str, str]: Paths of the downloaded files
        """
        if output_dir is None:
            output_dir = self.work_dir / "fastq"
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        try:
            # Build the fastq-dump command
            cmd = ["fastq-dump"]
            if paired:
                cmd.extend(["--split-files"])  # split paired-end reads into two files
            if compressed:
                cmd.extend(["--gzip"])  # save compressed output

            # Output directory
            cmd.extend(["--outdir", str(output_dir)])

            # Run accession
            cmd.append(run_accession)

            print(f"FASTQ download started: {run_accession}")
            print(f"Command: {' '.join(cmd)}")

            # Execute
            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode == 0:
                # Verify the downloaded files
                downloaded_files = {}
                suffix = ".fastq.gz" if compressed else ".fastq"

                if paired:
                    # Paired-end files
                    for i in [1, 2]:
                        filename = f"{run_accession}_{i}{suffix}"
                        filepath = output_dir / filename
                        if filepath.exists():
                            downloaded_files[f"read_{i}"] = str(filepath)
                else:
                    # Single-end file
                    filename = f"{run_accession}{suffix}"
                    filepath = output_dir / filename
                    if filepath.exists():
                        downloaded_files["reads"] = str(filepath)

                print(f"Download finished: {len(downloaded_files)} file(s)")
                return downloaded_files
            else:
                print(f"Download error: {result.stderr}")
                return {}

        except Exception as e:
            print(f"FASTQ download error: {e}")
            return {}
    def batch_download(self, run_list: List[str], max_concurrent: Optional[int] = None) -> Dict[str, Dict]:
        """
        Download multiple runs in parallel.

        Args:
            run_list: List of run accessions
            max_concurrent: Maximum number of parallel downloads

        Returns:
            Dict[str, Dict]: Download results per run
        """
        if max_concurrent is None:
            max_concurrent = self.max_workers

        results = {}
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_concurrent) as executor:
            # Start the parallel downloads
            future_to_run = {
                executor.submit(self.download_fastq, run): run
                for run in run_list
            }

            for future in concurrent.futures.as_completed(future_to_run):
                run = future_to_run[future]
                try:
                    result = future.result()
                    results[run] = result
                    print(f"Done: {run}")
                except Exception as e:
                    print(f"Error {run}: {e}")
                    results[run] = {"error": str(e)}

        return results

    def verify_download_integrity(self, file_path: str, expected_md5: Optional[str] = None) -> bool:
        """
        Verify the integrity of a downloaded file.

        Args:
            file_path: File path
            expected_md5: Expected MD5 hash

        Returns:
            bool: Verification result
        """
        if not os.path.exists(file_path):
            return False

        if expected_md5:
            # Compute the MD5 hash in chunks to keep memory use low
            hasher = hashlib.md5()
            with open(file_path, 'rb') as f:
                for chunk in iter(lambda: f.read(4096), b""):
                    hasher.update(chunk)
            actual_md5 = hasher.hexdigest()
            return actual_md5 == expected_md5

        # Without an expected MD5, only check that the file exists
        return True
# Usage example
if __name__ == "__main__":
    # Initialize the SRA data manager
    sra_manager = SRADataManager(work_dir="./sra_analysis", max_workers=2)

    # 1. Search for COVID-19 related RNA-seq data
    covid_studies = sra_manager.search_sra_studies(
        query="COVID-19 AND RNA-seq AND human",
        max_results=20
    )

    if not covid_studies.empty:
        print("\nSearch results (first 5):")
        print(covid_studies.head()[["StudyAccession", "Title", "Platform"]].to_string())

        # 2. Fetch run information for one study
        first_study = covid_studies.iloc[0]["StudyAccession"]
        run_info = sra_manager.get_run_info(first_study)

        if not run_info.empty:
            print(f"\nRuns in {first_study}:")
            print(run_info.head()[["Run", "LibraryStrategy", "spots", "bases"]].to_string())

            # 3. Download a small sample (for testing)
            test_runs = run_info.head(2)["Run"].tolist()
            print(f"\nTest download started: {test_runs}")

            # Individual downloads
            for run in test_runs:
                result = sra_manager.download_fastq(run, compressed=True)
                if result:
                    print(f"Download succeeded {run}: {list(result.keys())}")
                else:
                    print(f"Download failed: {run}")
I.2.2 Advanced Use of the UniProt Database
import requests
import pandas as pd
from io import StringIO
from typing import List, Dict, Optional
import time

class UniProtAnalyzer:
    """Comprehensive analysis tool for the UniProt protein database"""

    def __init__(self, rate_limit: float = 1.0):
        """
        Args:
            rate_limit: Delay between requests (seconds)
        """
        self.base_url = "https://rest.uniprot.org"
        self.rate_limit = rate_limit
        self.session = requests.Session()

        # Commonly used field sets
        self.common_fields = {
            "basic": [
                "accession", "id", "gene_names", "protein_name",
                "organism_name", "length", "mass"
            ],
            "sequence": [
                "accession", "sequence", "length", "mass",
                "cc_subcellular_location", "ft_domain"
            ],
            "function": [
                "accession", "protein_name", "cc_function",
                "go_c", "go_f", "go_p", "cc_pathway"
            ],
            "disease": [
                "accession", "gene_names", "cc_disease",
                "cc_allergen", "cc_toxic_dose"
            ],
            "structure": [
                "accession", "ft_helix", "ft_strand", "ft_turn",
                "ft_disulfid", "xref_pdb", "cc_similarity"
            ]
        }
    def search_proteins(self, query: str, organism: Optional[str] = None,
                        reviewed: Optional[bool] = None, max_results: int = 100,
                        fields: Optional[List[str]] = None) -> pd.DataFrame:
        """
        Search for proteins.

        Args:
            query: Search query (gene name, protein name, keywords, ...)
            organism: Organism (e.g. "human", "mouse", "9606")
            reviewed: True to search Swiss-Prot entries only
            max_results: Maximum number of results
            fields: List of fields to retrieve

        Returns:
            pd.DataFrame: Search results
        """
        # Build the query
        search_terms = [query]

        if organism:
            if organism.isdigit():
                search_terms.append(f"taxonomy_id:{organism}")
            else:
                search_terms.append(f"organism_name:{organism}")

        if reviewed is not None:
            search_terms.append("reviewed:true" if reviewed else "reviewed:false")

        final_query = " AND ".join(search_terms)

        # Field selection
        if fields is None:
            fields = self.common_fields["basic"]

        # API request
        params = {
            "query": final_query,
            "format": "tsv",
            "fields": ",".join(fields),
            "size": min(max_results, 500)  # API limit per request
        }

        try:
            response = self.session.get(f"{self.base_url}/uniprotkb/search", params=params)
            response.raise_for_status()

            # Convert the TSV payload into a DataFrame
            df = pd.read_csv(StringIO(response.text), sep='\t')
            print(f"Search results: {len(df)} proteins")
            return df

        except requests.exceptions.RequestException as e:
            print(f"UniProt search error: {e}")
            return pd.DataFrame()
        finally:
            time.sleep(self.rate_limit)
    def get_protein_details(self, accession: str,
                            include_features: bool = True,
                            include_interactions: bool = True) -> Dict:
        """
        Retrieve detailed information for a single protein.

        Args:
            accession: UniProt accession
            include_features: True to include feature annotations
            include_interactions: True to include interaction data

        Returns:
            Dict: Detailed protein information
        """
        try:
            # Fetch the entry, explicitly requesting JSON
            response = self.session.get(f"{self.base_url}/uniprotkb/{accession}",
                                        params={"format": "json"})
            response.raise_for_status()
            protein_data = response.json()

            # Extract structured information
            details = {
                "accession": accession,
                "entry_name": protein_data.get("uniProtkbId", ""),
                "protein_names": self._extract_protein_names(protein_data),
                "gene_names": self._extract_gene_names(protein_data),
                "organism": self._extract_organism(protein_data),
                "sequence_info": self._extract_sequence_info(protein_data),
                "subcellular_location": self._extract_subcellular_location(protein_data),
                "function": self._extract_function(protein_data),
                "go_annotations": self._extract_go_annotations(protein_data),
                "pathways": self._extract_pathways(protein_data),
                "diseases": self._extract_diseases(protein_data)
            }

            # Optional extras
            if include_features:
                details["features"] = self._extract_features(protein_data)
            if include_interactions:
                details["interactions"] = self.get_protein_interactions(accession)

            return details

        except requests.exceptions.RequestException as e:
            print(f"Protein detail error: {e}")
            return {}
        finally:
            time.sleep(self.rate_limit)
    def _extract_protein_names(self, data: Dict) -> Dict:
        """Extract protein names"""
        names = {"recommended": "", "alternative": [], "short": []}

        if "proteinDescription" in data:
            desc = data["proteinDescription"]
            if "recommendedName" in desc:
                names["recommended"] = desc["recommendedName"].get("fullName", {}).get("value", "")
            if "alternativeNames" in desc:
                for alt in desc["alternativeNames"]:
                    if "fullName" in alt:
                        names["alternative"].append(alt["fullName"].get("value", ""))
                    if "shortNames" in alt:
                        names["short"].extend([sn.get("value", "") for sn in alt["shortNames"]])
        return names

    def _extract_gene_names(self, data: Dict) -> Dict:
        """Extract gene names"""
        genes = {"primary": "", "synonyms": [], "ordered_locus": [], "orf": []}

        if "genes" in data:
            for gene in data["genes"]:
                if gene.get("geneName"):
                    genes["primary"] = gene["geneName"].get("value", "")
                if "synonyms" in gene:
                    genes["synonyms"].extend([syn.get("value", "") for syn in gene["synonyms"]])
                if "orderedLocusNames" in gene:
                    genes["ordered_locus"].extend([oln.get("value", "") for oln in gene["orderedLocusNames"]])
                if "orfNames" in gene:
                    genes["orf"].extend([orf.get("value", "") for orf in gene["orfNames"]])
        return genes

    def _extract_organism(self, data: Dict) -> Dict:
        """Extract organism information"""
        organism = {"scientific_name": "", "common_name": "", "taxonomy_id": 0}

        if "organism" in data:
            org = data["organism"]
            organism["scientific_name"] = org.get("scientificName", "")
            organism["common_name"] = org.get("commonName", "")
            organism["taxonomy_id"] = org.get("taxonId", 0)
        return organism

    def _extract_sequence_info(self, data: Dict) -> Dict:
        """Extract sequence information"""
        seq_info = {"length": 0, "mass": 0, "checksum": "", "sequence": ""}

        if "sequence" in data:
            seq = data["sequence"]
            seq_info["length"] = seq.get("length", 0)
            seq_info["mass"] = seq.get("molWeight", 0)
            seq_info["checksum"] = seq.get("crc64", "")
            seq_info["sequence"] = seq.get("value", "")
        return seq_info
    def _extract_go_annotations(self, data: Dict) -> Dict:
        """Extract Gene Ontology annotations"""
        go_terms = {"molecular_function": [], "biological_process": [], "cellular_component": []}

        for ref in data.get("uniProtKBCrossReferences", []):
            if ref.get("database") != "GO":
                continue
            go_id = ref.get("id", "")
            go_value = ""
            go_evidence = ""
            for prop in ref.get("properties", []):
                if prop.get("key") == "GoTerm":
                    go_value = prop.get("value", "")  # e.g. "F:DNA binding"
                elif prop.get("key") == "GoEvidenceType":
                    go_evidence = prop.get("value", "")

            term = go_value.split(":", 1)[1] if ":" in go_value else go_value
            go_entry = {"id": go_id, "term": term, "evidence": go_evidence}

            # Classify by the aspect prefix of the GoTerm value
            # (F = molecular function, P = biological process, C = cellular component)
            if go_value.startswith("F:"):
                go_terms["molecular_function"].append(go_entry)
            elif go_value.startswith("P:"):
                go_terms["biological_process"].append(go_entry)
            elif go_value.startswith("C:"):
                go_terms["cellular_component"].append(go_entry)
        return go_terms
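    # NOTE: the helper methods below are referenced by get_protein_details but
    # were missing from the original listing. These are minimal sketches that
    # assume the rest.uniprot.org JSON layout ("comments" entries keyed by
    # "commentType", and a top-level "features" list).

    def _extract_subcellular_location(self, data: Dict) -> List[str]:
        """Extract subcellular location annotations"""
        locations = []
        for comment in data.get("comments", []):
            if comment.get("commentType") == "SUBCELLULAR LOCATION":
                for loc in comment.get("subcellularLocations", []):
                    value = loc.get("location", {}).get("value", "")
                    if value:
                        locations.append(value)
        return locations

    def _extract_function(self, data: Dict) -> List[str]:
        """Extract free-text function comments"""
        return [text.get("value", "")
                for comment in data.get("comments", [])
                if comment.get("commentType") == "FUNCTION"
                for text in comment.get("texts", [])]

    def _extract_pathways(self, data: Dict) -> List[str]:
        """Extract pathway comments"""
        return [text.get("value", "")
                for comment in data.get("comments", [])
                if comment.get("commentType") == "PATHWAY"
                for text in comment.get("texts", [])]

    def _extract_diseases(self, data: Dict) -> List[Dict]:
        """Extract disease involvement annotations"""
        diseases = []
        for comment in data.get("comments", []):
            if comment.get("commentType") == "DISEASE":
                disease = comment.get("disease", {})
                diseases.append({"name": disease.get("diseaseId", ""),
                                 "description": disease.get("description", "")})
        return diseases

    def _extract_features(self, data: Dict) -> List[Dict]:
        """Extract sequence features (domains, sites, secondary structure)"""
        features = []
        for feature in data.get("features", []):
            location = feature.get("location", {})
            features.append({
                "type": feature.get("type", ""),
                "description": feature.get("description", ""),
                "start": location.get("start", {}).get("value"),
                "end": location.get("end", {}).get("value"),
            })
        return features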
    def get_protein_interactions(self, accession: str, max_interactions: int = 50) -> List[Dict]:
        """
        Retrieve protein-protein interaction data.

        Args:
            accession: UniProt accession
            max_interactions: Maximum number of interactions

        Returns:
            List[Dict]: Interaction records
        """
        try:
            # Query the STRING database API; the interaction_partners
            # endpoint (unlike /network) supports a result limit
            string_url = "https://string-db.org/api/json/interaction_partners"
            params = {
                "identifiers": accession,
                "species": 9606,  # human
                "limit": max_interactions
            }

            response = self.session.get(string_url, params=params)
            response.raise_for_status()

            interactions = []
            for interaction in response.json():
                interactions.append({
                    "partner_a": interaction.get("preferredName_A", ""),
                    "partner_b": interaction.get("preferredName_B", ""),
                    "score": interaction.get("score", 0),
                    "interaction_type": "protein-protein"
                })
            return interactions

        except requests.exceptions.RequestException as e:
            print(f"Interaction lookup error: {e}")
            return []
        finally:
            time.sleep(self.rate_limit)
    def analyze_protein_family(self, gene_family: str, organisms: Optional[List[str]] = None) -> pd.DataFrame:
        """
        Comparative analysis of a protein family.

        Args:
            gene_family: Gene family name (e.g. "histone", "kinase")
            organisms: List of target organisms

        Returns:
            pd.DataFrame: Family analysis results
        """
        if organisms is None:
            organisms = ["human", "mouse", "rat", "zebrafish"]

        family_data = []
        for organism in organisms:
            print(f"Searching the {gene_family} family in {organism}...")

            # Family search
            results = self.search_proteins(
                query=gene_family,
                organism=organism,
                reviewed=True,
                max_results=100,
                fields=self.common_fields["basic"] + ["cc_function", "go_f"]
            )

            if not results.empty:
                results["organism"] = organism
                results["family"] = gene_family
                family_data.append(results)

            time.sleep(self.rate_limit)

        if family_data:
            combined_df = pd.concat(family_data, ignore_index=True)

            # Family statistics
            print(f"\n{gene_family} family analysis:")
            print(f"- Total proteins: {len(combined_df)}")
            print("- Distribution by organism:")
            print(combined_df["organism"].value_counts().to_string())
            return combined_df
        else:
            return pd.DataFrame()
    def functional_enrichment_analysis(self, protein_list: List[str]) -> Dict:
        """
        Functional enrichment analysis of a protein list.

        Args:
            protein_list: List of UniProt accessions

        Returns:
            Dict: Enrichment results
        """
        # Fetch detailed records for each protein
        proteins_data = []
        for accession in protein_list[:20]:  # cap the number of requests
            details = self.get_protein_details(accession, include_features=False)
            if details:
                proteins_data.append(details)
            time.sleep(self.rate_limit)

        if not proteins_data:
            return {}

        # Tally GO terms
        go_counts = {"molecular_function": {}, "biological_process": {}, "cellular_component": {}}
        for protein in proteins_data:
            go_annotations = protein.get("go_annotations", {})
            for category, terms in go_annotations.items():
                for term in terms:
                    term_id = term.get("term", "Unknown")
                    go_counts[category][term_id] = go_counts[category].get(term_id, 0) + 1

        # Tally pathways
        pathway_counts = {}
        for protein in proteins_data:
            for pathway in protein.get("pathways", []):
                pathway_counts[pathway] = pathway_counts.get(pathway, 0) + 1

        # Sort results by frequency
        enrichment_results = {
            "go_molecular_function": sorted(go_counts["molecular_function"].items(),
                                            key=lambda x: x[1], reverse=True)[:10],
            "go_biological_process": sorted(go_counts["biological_process"].items(),
                                            key=lambda x: x[1], reverse=True)[:10],
            "go_cellular_component": sorted(go_counts["cellular_component"].items(),
                                            key=lambda x: x[1], reverse=True)[:10],
            "pathways": sorted(pathway_counts.items(), key=lambda x: x[1], reverse=True)[:10],
            "total_proteins": len(proteins_data)
        }
        return enrichment_results
# Usage example
if __name__ == "__main__":
    # Initialize the UniProt analysis tool
    uniprot = UniProtAnalyzer(rate_limit=1.0)

    # 1. Detailed analysis of the BRCA1 protein
    print("=== BRCA1 protein details ===")
    brca1_details = uniprot.get_protein_details("P38398",
                                                include_features=True,
                                                include_interactions=True)
    if brca1_details:
        print(f"Protein name: {brca1_details['protein_names']['recommended']}")
        print(f"Gene name: {brca1_details['gene_names']['primary']}")
        print(f"Sequence length: {brca1_details['sequence_info']['length']} aa")
        print(f"Molecular mass: {brca1_details['sequence_info']['mass']} Da")
        print(f"Interactions: {len(brca1_details.get('interactions', []))}")

    # 2. Comparative analysis of the kinase family
    print("\n=== Kinase family comparison ===")
    kinase_family = uniprot.analyze_protein_family(
        gene_family="protein kinase",
        organisms=["human", "mouse"]
    )
    if not kinase_family.empty:
        print("\nTop 10 proteins:")
        print(kinase_family.head(10)[["Entry", "Gene Names", "Protein names", "organism"]].to_string())

    # 3. Functional enrichment analysis of cancer-related proteins
    print("\n=== Functional enrichment of cancer-related proteins ===")
    # Collect accessions of reviewed human entries related to tumor suppression
    cancer_search = uniprot.search_proteins("cancer AND tumor suppressor",
                                            organism="human",
                                            reviewed=True,
                                            max_results=20)
    if not cancer_search.empty:
        cancer_accessions = cancer_search["Entry"].tolist()[:10]
        enrichment = uniprot.functional_enrichment_analysis(cancer_accessions)

        print(f"Proteins analyzed: {enrichment.get('total_proteins', 0)}")
        print("\nTop GO Molecular Function terms:")
        for term, count in enrichment.get("go_molecular_function", [])[:5]:
            print(f"  {term}: {count}")
        print("\nTop GO Biological Process terms:")
        for term, count in enrichment.get("go_biological_process", [])[:5]:
            print(f"  {term}: {count}")