I.3 専門データベースの戦略的活用
I.3.1 臨床データベースの統合的活用
ClinVar + COSMIC + PharmGKB の統合解析
import requests
import pandas as pd
import json
from typing import List, Dict, Optional
import time
import sqlite3
from pathlib import Path
import xml.etree.ElementTree as ET
class ClinicalVariantAnalyzer:
    """Integrated analysis platform for clinical variant databases.

    ClinVar is queried live through the NCBI E-utilities endpoints; the
    COSMIC and PharmGKB lookups in this implementation return placeholder
    records. ClinVar results are cached in a local SQLite database.
    """

    # Seconds to wait on any single HTTP request. Without a timeout,
    # ``requests`` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30.0

    # Inputs used to fabricate the placeholder PharmGKB records.
    _DUMMY_PHARMGKB_GENES = frozenset({"CYP2D6", "CYP2C19", "SLCO1B1", "DPYD"})
    _DUMMY_PHARMGKB_DRUGS = ("warfarin", "clopidogrel", "simvastatin")

    # DDL for the cache tables; each statement is idempotent.
    _CACHE_SCHEMA = (
        '''
        CREATE TABLE IF NOT EXISTS clinvar_variants (
            id INTEGER PRIMARY KEY,
            variation_id TEXT UNIQUE,
            gene_symbol TEXT,
            hgvs_c TEXT,
            hgvs_p TEXT,
            clinical_significance TEXT,
            condition TEXT,
            review_status TEXT,
            last_updated TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS cosmic_variants (
            id INTEGER PRIMARY KEY,
            cosmic_id TEXT UNIQUE,
            gene_symbol TEXT,
            mutation_type TEXT,
            mutation_description TEXT,
            tissue_type TEXT,
            histology TEXT,
            primary_site TEXT,
            sample_source TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS pharmgkb_variants (
            id INTEGER PRIMARY KEY,
            rsid TEXT,
            gene_symbol TEXT,
            variant_annotation TEXT,
            drug_association TEXT,
            phenotype TEXT,
            evidence_level TEXT,
            population TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        ''',
    )

    def __init__(self, cache_dir: str = "./clinical_cache"):
        """Create the cache directory/database and set up API settings.

        Args:
            cache_dir: Directory that holds the SQLite cache file. Missing
                parent directories are created as well.
        """
        self.cache_dir = Path(cache_dir)
        # parents=True so a nested cache path does not fail on first use.
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # SQLite cache database
        self.db_path = self.cache_dir / "clinical_variants.db"
        self._init_cache_db()

        # Base URLs of the remote services
        self.apis = {
            "clinvar": "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/",
            "cosmic": "https://cancer.sanger.ac.uk/cosmic/",
            "pharmgkb": "https://api.pharmgkb.org/v1/",
            "ensembl": "https://rest.ensembl.org/",
        }
        # Pause (in seconds) between consecutive requests to each service
        self.rate_limits = {
            "clinvar": 1.0,
            "cosmic": 2.0,
            "pharmgkb": 1.5,
            "ensembl": 0.5,
        }

    def _init_cache_db(self):
        """Create the cache tables if they do not exist yet."""
        conn = sqlite3.connect(self.db_path)
        try:
            # ``with conn`` commits on success and rolls back on error; it
            # does not close the connection, hence the surrounding finally.
            with conn:
                for statement in self._CACHE_SCHEMA:
                    conn.execute(statement)
        finally:
            conn.close()

    @staticmethod
    def _parse_clinvar_summary(variant_id: str, gene_symbol: str, data: Dict) -> Dict:
        """Flatten one ESummary record into the row layout used by this class."""
        # NOTE(review): newer ClinVar esummary payloads appear to report the
        # classification under "germline_classification" instead of
        # "clinical_significance" -- confirm against the current API. The
        # fallback only applies when the original key is missing or empty.
        classification = (
            data.get("clinical_significance")
            or data.get("germline_classification")
            or {}
        )
        if not isinstance(classification, dict):
            classification = {}

        traits = data.get("trait_set") or classification.get("trait_set") or []
        first_trait = {}
        if isinstance(traits, list) and traits and isinstance(traits[0], dict):
            first_trait = traits[0]

        return {
            "variation_id": variant_id,
            "gene_symbol": gene_symbol,
            "title": data.get("title", ""),
            "clinical_significance": classification.get("description", ""),
            "review_status": classification.get("review_status", ""),
            "condition": first_trait.get("trait_name", ""),
            "variation_type": data.get("variation_type", ""),
            "last_updated": data.get("last_updated", ""),
        }

    def search_clinvar_variants(self, gene_symbol: str, max_results: int = 100) -> pd.DataFrame:
        """Search ClinVar for variants of a gene.

        Runs an ESearch request to collect variation IDs, then an ESummary
        request for their details, and stores the result in the cache.

        Args:
            gene_symbol: Gene symbol to search for.
            max_results: Maximum number of IDs requested from ESearch.

        Returns:
            pd.DataFrame: One row per variant; empty when nothing was found
            or when the request/response could not be processed.
        """
        try:
            # ESearch: collect the variation IDs
            search_response = requests.get(
                f"{self.apis['clinvar']}esearch.fcgi",
                params={
                    "db": "clinvar",
                    "term": f"{gene_symbol}[gene] AND single_gene[prop]",
                    "retmax": max_results,
                    "retmode": "json",
                },
                timeout=self.REQUEST_TIMEOUT,
            )
            search_response.raise_for_status()
            search_data = search_response.json()
            variant_ids = search_data["esearchresult"]["idlist"]
            if not variant_ids:
                print(f"ClinVarで{gene_symbol}の変異が見つかりませんでした")
                return pd.DataFrame()

            # Respect the configured pause before the second request.
            time.sleep(self.rate_limits["clinvar"])

            # ESummary: fetch details for the collected IDs
            summary_response = requests.get(
                f"{self.apis['clinvar']}esummary.fcgi",
                params={
                    "db": "clinvar",
                    "id": ",".join(variant_ids),
                    "retmode": "json",
                },
                timeout=self.REQUEST_TIMEOUT,
            )
            summary_response.raise_for_status()
            records = summary_response.json()["result"]

            variants = [
                self._parse_clinvar_summary(variant_id, gene_symbol, data)
                for variant_id, data in records.items()
                # "uids" is the ID index, not a record; skip non-dict entries.
                if variant_id != "uids" and isinstance(data, dict)
            ]
        except requests.exceptions.RequestException as e:
            print(f"ClinVar検索エラー: {e}")
            return pd.DataFrame()
        except (ValueError, KeyError, TypeError, AttributeError) as e:
            # Malformed JSON or an unexpected payload shape: report it the
            # same way as a transport error instead of crashing the caller.
            print(f"ClinVar検索エラー: {e}")
            return pd.DataFrame()

        df = pd.DataFrame(variants)
        print(f"ClinVar検索結果: {len(df)}件の変異")

        # Persist to the local cache
        self._cache_clinvar_data(df)
        return df

    def search_cosmic_variants(self, gene_symbol: str, cancer_type: Optional[str] = None) -> pd.DataFrame:
        """Return COSMIC cancer-variant records for a gene (placeholder data).

        The real COSMIC API requires authentication, so this method
        fabricates ten records instead of performing a request.

        Args:
            gene_symbol: Gene symbol copied into every record.
            cancer_type: Optional cancer type; "lung" yields primary site
                "lung", anything else yields "breast".

        Returns:
            pd.DataFrame: Ten placeholder COSMIC records.
        """
        print(f"COSMIC検索: {gene_symbol}")

        primary_site = "lung" if cancer_type == "lung" else "breast"
        records = [
            {
                "cosmic_id": f"COSM{1000000 + i}",
                "gene_symbol": gene_symbol,
                "mutation_type": "Substitution - Missense",
                "mutation_description": f"c.{100 + i*10}G>A p.V{34 + i}M",
                "tissue_type": "Carcinoma",
                "histology": "adenocarcinoma",
                "primary_site": primary_site,
                "sample_source": "surgical resection",
            }
            for i in range(10)
        ]

        df = pd.DataFrame(records)
        print(f"COSMIC検索結果: {len(df)}件の変異")
        return df

    def search_pharmgkb_variants(self, gene_symbol: str) -> pd.DataFrame:
        """Return PharmGKB pharmacogenomic records for a gene (placeholder data).

        No request is sent; records are fabricated only for a fixed set of
        pharmacogenes (matched case-insensitively).

        Args:
            gene_symbol: Gene symbol to look up.

        Returns:
            pd.DataFrame: Three placeholder records for a supported gene,
            otherwise an empty frame.
        """
        print(f"PharmGKB検索: {gene_symbol}")

        records = []
        # The isinstance guard keeps non-string input on the "no results"
        # path instead of raising from ``.upper()``.
        if isinstance(gene_symbol, str) and gene_symbol.upper() in self._DUMMY_PHARMGKB_GENES:
            records = [
                {
                    "rsid": f"rs{1000000 + i}",
                    "gene_symbol": gene_symbol,
                    "variant_annotation": f"{gene_symbol}*{i+2}",
                    "drug_association": drug,
                    "phenotype": "altered drug metabolism",
                    "evidence_level": "1A",
                    "population": "European",
                }
                for i, drug in enumerate(self._DUMMY_PHARMGKB_DRUGS)
            ]

        df = pd.DataFrame(records)
        print(f"PharmGKB検索結果: {len(df)}件の薬理遺伝学変異")
        return df

    def integrated_variant_analysis(self, gene_symbol: str,
                                    include_population_data: bool = True) -> Dict:
        """Run all database lookups for a gene and summarise the results.

        Args:
            gene_symbol: Gene symbol to analyse.
            include_population_data: When True, also call
                ``_get_population_frequencies``.

        Returns:
            Dict: Per-database totals, value distributions, the list of
            high-impact ClinVar variants and the population data.
        """
        print(f"\n=== {gene_symbol} 統合変異解析 ===")

        # Collect data from each database
        clinvar_data = self.search_clinvar_variants(gene_symbol, max_results=50)
        cosmic_data = self.search_cosmic_variants(gene_symbol)
        pharmgkb_data = self.search_pharmgkb_variants(gene_symbol)

        population_data = {}
        if include_population_data:
            population_data = self._get_population_frequencies(gene_symbol)

        analysis_results = {
            "gene_symbol": gene_symbol,
            "total_clinvar_variants": len(clinvar_data),
            "total_cosmic_variants": len(cosmic_data),
            "total_pharmgkb_variants": len(pharmgkb_data),
            "clinvar_significance_distribution": {},
            "cosmic_cancer_distribution": {},
            "pharmgkb_drug_associations": {},
            "high_impact_variants": [],
            "population_frequencies": population_data,
        }

        def count_values(series) -> Dict:
            # Coerce counts to plain ints so the result is JSON-serialisable
            # regardless of how the installed pandas boxes scalars.
            return {key: int(count) for key, count in series.value_counts().items()}

        # ClinVar clinical-significance distribution
        if not clinvar_data.empty:
            significance = clinvar_data["clinical_significance"]
            analysis_results["clinvar_significance_distribution"] = count_values(significance)

            # High-impact = (likely) pathogenic. The word boundaries keep
            # "Conflicting interpretations of pathogenicity" from matching,
            # which a plain substring search for "pathogenic" would include.
            is_high_impact = significance.str.contains(
                r"\bpathogenic\b", na=False, case=False
            )
            analysis_results["high_impact_variants"] = (
                clinvar_data[is_high_impact].to_dict('records')
            )

        # COSMIC distribution by primary site
        if not cosmic_data.empty:
            analysis_results["cosmic_cancer_distribution"] = count_values(
                cosmic_data["primary_site"]
            )

        # PharmGKB drug associations
        if not pharmgkb_data.empty:
            analysis_results["pharmgkb_drug_associations"] = count_values(
                pharmgkb_data["drug_association"]
            )

        return analysis_results

    def _get_population_frequencies(self, gene_symbol: str) -> Dict:
        """Look up the gene through the Ensembl REST ``lookup/symbol`` endpoint.

        Despite the name, the returned dict holds gene-level metadata
        (Ensembl ID, chromosome, coordinates, strand, biotype); no allele
        frequencies are fetched here.

        Returns:
            Dict: Gene metadata, or ``{"error": ...}`` when the lookup fails.
        """
        try:
            response = requests.get(
                f"{self.apis['ensembl']}lookup/symbol/homo_sapiens/{gene_symbol}",
                params={"expand": "1"},
                headers={"Content-Type": "application/json"},
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                return {"error": f"Ensembl lookup failed: {response.status_code}"}
            gene_data = response.json()
            return {
                "ensembl_gene_id": gene_data.get("id", ""),
                "chromosome": gene_data.get("seq_region_name", ""),
                "start": gene_data.get("start", 0),
                "end": gene_data.get("end", 0),
                "strand": gene_data.get("strand", 0),
                "biotype": gene_data.get("biotype", ""),
            }
        except (requests.exceptions.RequestException, ValueError, AttributeError) as e:
            # Best effort: this enrichment is optional, so failures are
            # reported in the result instead of aborting the analysis.
            return {"error": f"Population data error: {e}"}

    def _cache_clinvar_data(self, df: pd.DataFrame):
        """Upsert ClinVar rows into the SQLite cache, keyed by ``variation_id``.

        A row that cannot be stored is reported and skipped; the remaining
        rows are still written.
        """
        if df.empty:
            return

        columns = (
            "variation_id", "gene_symbol", "clinical_significance",
            "condition", "review_status", "last_updated",
        )
        conn = sqlite3.connect(self.db_path)
        try:
            for record in df.to_dict("records"):
                try:
                    conn.execute('''
                        INSERT OR REPLACE INTO clinvar_variants
                        (variation_id, gene_symbol, clinical_significance,
                        condition, review_status, last_updated)
                        VALUES (?, ?, ?, ?, ?, ?)
                    ''', tuple(record.get(column, "") for column in columns))
                except sqlite3.Error as e:
                    print(f"キャッシュ保存エラー: {e}")
            conn.commit()
        finally:
            # Close even if the commit raises so the handle is not leaked.
            conn.close()

    def generate_clinical_report(self, gene_symbol: str, output_format: str = "markdown",
                                 analysis: Optional[Dict] = None) -> str:
        """Render a clinical variant report for a gene.

        Args:
            gene_symbol: Gene symbol shown in the report.
            output_format: "markdown" or "json". Any other value (including
                "html", which is not implemented) returns ``str(analysis)``.
            analysis: Optional result of ``integrated_variant_analysis`` to
                render. When omitted, the analysis is run here, which
                repeats all database lookups.

        Returns:
            str: The report text.
        """
        if analysis is None:
            analysis = self.integrated_variant_analysis(gene_symbol)

        if output_format == "json":
            return json.dumps(analysis, indent=2, ensure_ascii=False)
        if output_format != "markdown":
            return str(analysis)

        header = [
            f"# {gene_symbol} 臨床変異解析レポート",
            "## 概要",
            f"- **遺伝子**: {gene_symbol}",
            f"- **ClinVar変異数**: {analysis['total_clinvar_variants']}",
            f"- **COSMIC変異数**: {analysis['total_cosmic_variants']}",
            f"- **PharmGKB薬理遺伝学変異数**: {analysis['total_pharmgkb_variants']}",
            "## ClinVar臨床的意義分布",
        ]
        # Each chunk carries its own newlines; they are joined once at the end.
        parts = ["\n".join(header) + "\n"]

        parts.extend(
            f"- {significance}: {count}件\n"
            for significance, count in analysis["clinvar_significance_distribution"].items()
        )

        parts.append("\n## 高影響度変異\n")
        # Only the first five high-impact variants are listed.
        parts.extend(
            f"{i}. {variant.get('title', 'N/A')} - {variant.get('clinical_significance', 'N/A')}\n"
            for i, variant in enumerate(analysis["high_impact_variants"][:5], 1)
        )

        if analysis["cosmic_cancer_distribution"]:
            parts.append("\n## COSMIC がん種分布\n")
            parts.extend(
                f"- {cancer}: {count}件\n"
                for cancer, count in analysis["cosmic_cancer_distribution"].items()
            )

        if analysis["pharmgkb_drug_associations"]:
            parts.append("\n## PharmGKB 薬物関連\n")
            parts.extend(
                f"- {drug}: {count}件\n"
                for drug, count in analysis["pharmgkb_drug_associations"].items()
            )

        parts.append(f"\n---\n生成日時: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        return "".join(parts)
# Example usage
if __name__ == "__main__":
    # Build the analyzer with its own cache directory.
    analyzer = ClinicalVariantAnalyzer(cache_dir="./clinical_analysis_cache")

    # A handful of major cancer-associated genes to walk through.
    target_genes = ("BRCA1", "TP53", "EGFR", "KRAS")
    banner = "=" * 50

    for gene in target_genes:
        print(f"\n{banner}")
        print(f"{gene} 遺伝子の臨床変異解析")
        print(banner)

        # Run the integrated analysis and print a short summary.
        summary = analyzer.integrated_variant_analysis(gene, include_population_data=True)
        print("\n解析結果サマリー:")
        print(f"- ClinVar変異: {summary['total_clinvar_variants']}件")
        print(f"- COSMIC変異: {summary['total_cosmic_variants']}件")
        print(f"- PharmGKB変異: {summary['total_pharmgkb_variants']}件")

        # Render the markdown report. Note that generate_clinical_report
        # runs the integrated analysis again for this gene.
        markdown_report = analyzer.generate_clinical_report(gene, output_format="markdown")

        # Write the report next to the script, one file per gene.
        destination = Path(f"./clinical_report_{gene}.md")
        destination.write_text(markdown_report, encoding='utf-8')
        print(f"レポート保存: {destination}")

        # Pause between genes.
        time.sleep(2)

    print("\n全ての解析が完了しました。")