I.3 専門データベースの戦略的活用

I.3.1 臨床データベースの統合的活用

ClinVar + COSMIC + PharmGKB の統合解析

import requests
import pandas as pd
import json
from typing import List, Dict, Optional
import time
import sqlite3
from pathlib import Path
import xml.etree.ElementTree as ET

class ClinicalVariantAnalyzer:
    """臨床変異データベースの統合解析プラットフォーム"""
    
    def __init__(self, cache_dir: str = "./clinical_cache"):
        """
        Args:
            cache_dir: データキャッシュディレクトリ
        """
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)
        
        # SQLiteキャッシュデータベース
        self.db_path = self.cache_dir / "clinical_variants.db"
        self._init_cache_db()
        
        # API設定
        self.apis = {
            "clinvar": "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/",
            "cosmic": "https://cancer.sanger.ac.uk/cosmic/",
            "pharmgkb": "https://api.pharmgkb.org/v1/",
            "ensembl": "https://rest.ensembl.org/"
        }
        
        self.rate_limits = {
            "clinvar": 1.0,    # 1秒間隔
            "cosmic": 2.0,     # 2秒間隔
            "pharmgkb": 1.5,   # 1.5秒間隔
            "ensembl": 0.5     # 0.5秒間隔
        }
    
    def _init_cache_db(self):
        """キャッシュデータベースの初期化"""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        # ClinVarテーブル
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS clinvar_variants (
                id INTEGER PRIMARY KEY,
                variation_id TEXT UNIQUE,
                gene_symbol TEXT,
                hgvs_c TEXT,
                hgvs_p TEXT,
                clinical_significance TEXT,
                condition TEXT,
                review_status TEXT,
                last_updated TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        
        # COSMICテーブル
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS cosmic_variants (
                id INTEGER PRIMARY KEY,
                cosmic_id TEXT UNIQUE,
                gene_symbol TEXT,
                mutation_type TEXT,
                mutation_description TEXT,
                tissue_type TEXT,
                histology TEXT,
                primary_site TEXT,
                sample_source TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        
        # PharmGKBテーブル
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS pharmgkb_variants (
                id INTEGER PRIMARY KEY,
                rsid TEXT,
                gene_symbol TEXT,
                variant_annotation TEXT,
                drug_association TEXT,
                phenotype TEXT,
                evidence_level TEXT,
                population TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        
        conn.commit()
        conn.close()
    
    def search_clinvar_variants(self, gene_symbol: str, max_results: int = 100,
                                timeout: float = 30.0) -> pd.DataFrame:
        """
        Search ClinVar for variants of a gene through the NCBI E-utilities.

        An ESearch request collects ClinVar record IDs, the method then waits
        for the configured ClinVar rate-limit interval and fetches the
        matching ESummary records, flattening them into one row per record.
        A non-empty result is also written to the local SQLite cache.

        Args:
            gene_symbol: Gene symbol used in the ``[gene]`` search term.
            max_results: Upper bound on the number of IDs requested (``retmax``).
            timeout: Seconds to wait for each HTTP request before giving up.

        Returns:
            pd.DataFrame: One row per ClinVar record with the columns
            ``variation_id``, ``gene_symbol``, ``title``,
            ``clinical_significance``, ``review_status``, ``condition``,
            ``variation_type`` and ``last_updated``. The frame is empty when
            no record matched or when a request or JSON decoding failed.
        """
        search_url = f"{self.apis['clinvar']}esearch.fcgi"
        search_params = {
            "db": "clinvar",
            "term": f"{gene_symbol}[gene] AND single_gene[prop]",
            "retmax": max_results,
            "retmode": "json"
        }
        summary_url = f"{self.apis['clinvar']}esummary.fcgi"

        try:
            # Step 1: ESearch returns the list of matching record IDs.
            response = requests.get(search_url, params=search_params, timeout=timeout)
            response.raise_for_status()
            search_data = response.json()

            # Read defensively: a payload without "esearchresult" is treated
            # as "no hits" instead of raising KeyError.
            variant_ids = search_data.get("esearchresult", {}).get("idlist", [])
            if not variant_ids:
                print(f"ClinVarで{gene_symbol}の変異が見つかりませんでした")
                return pd.DataFrame()

            # Step 2: ESummary returns the details for those IDs.
            summary_params = {
                "db": "clinvar",
                "id": ",".join(variant_ids),
                "retmode": "json"
            }
            time.sleep(self.rate_limits["clinvar"])
            response = requests.get(summary_url, params=summary_params, timeout=timeout)
            response.raise_for_status()
            summary_data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f"ClinVar検索エラー: {e}")
            return pd.DataFrame()

        # Flatten each summary record into one row.
        variants = []
        for variant_id, data in summary_data.get("result", {}).items():
            # "uids" is the index list, not a record.
            if variant_id == "uids" or not isinstance(data, dict):
                continue

            # NOTE(review): newer ESummary payloads appear to report the
            # classification under "germline_classification" (with the trait
            # set nested inside it) instead of "clinical_significance" —
            # confirm against current E-utilities output.
            classification = (
                data.get("clinical_significance")
                or data.get("germline_classification")
                or {}
            )
            if not isinstance(classification, dict):
                classification = {}
            traits = data.get("trait_set") or classification.get("trait_set") or []

            variants.append({
                "variation_id": variant_id,
                "gene_symbol": gene_symbol,
                "title": data.get("title", ""),
                "clinical_significance": classification.get("description", ""),
                "review_status": classification.get("review_status", ""),
                "condition": traits[0].get("trait_name", "") if traits else "",
                "variation_type": data.get("variation_type", ""),
                "last_updated": data.get("last_updated", "")
            })

        df = pd.DataFrame(variants)
        print(f"ClinVar検索結果: {len(df)}件の変異")

        # Persist to the local cache (no-op for an empty frame).
        self._cache_clinvar_data(df)

        return df
    
    def search_cosmic_variants(self, gene_symbol: str, cancer_type: Optional[str] = None) -> pd.DataFrame:
        """
        COSMICでのがん変異検索
        
        Args:
            gene_symbol: 遺伝子シンボル
            cancer_type: がん種(オプション)
        
        Returns:
            pd.DataFrame: COSMIC変異データ
        """
        # 注意: 実際のCOSMIC APIは認証が必要です。ここではダミーデータを生成
        print(f"COSMIC検索: {gene_symbol}")
        
        # ダミーデータ生成(実際の実装では認証付きAPIアクセス)
        dummy_data = []
        for i in range(10):
            dummy_data.append({
                "cosmic_id": f"COSM{1000000 + i}",
                "gene_symbol": gene_symbol,
                "mutation_type": "Substitution - Missense",
                "mutation_description": f"c.{100 + i*10}G>A p.V{34 + i}M",
                "tissue_type": "Carcinoma",
                "histology": "adenocarcinoma",
                "primary_site": "lung" if cancer_type == "lung" else "breast",
                "sample_source": "surgical resection"
            })
        
        df = pd.DataFrame(dummy_data)
        print(f"COSMIC検索結果: {len(df)}件の変異")
        
        return df
    
    def search_pharmgkb_variants(self, gene_symbol: str) -> pd.DataFrame:
        """
        PharmGKBでの薬理遺伝学変異検索
        
        Args:
            gene_symbol: 遺伝子シンボル
        
        Returns:
            pd.DataFrame: PharmGKB変異データ
        """
        try:
            # PharmGKB REST API
            api_url = f"{self.apis['pharmgkb']}data/variantAnnotation"
            params = {
                "gene": gene_symbol,
                "format": "json"
            }
            
            # 注意: 実際のAPIは認証が必要な場合があります
            # ここではダミーデータで代替
            print(f"PharmGKB検索: {gene_symbol}")
            
            dummy_pharmgkb_data = []
            if gene_symbol.upper() in ["CYP2D6", "CYP2C19", "SLCO1B1", "DPYD"]:
                for i, drug in enumerate(["warfarin", "clopidogrel", "simvastatin"][:3]):
                    dummy_pharmgkb_data.append({
                        "rsid": f"rs{1000000 + i}",
                        "gene_symbol": gene_symbol,
                        "variant_annotation": f"{gene_symbol}*{i+2}",
                        "drug_association": drug,
                        "phenotype": "altered drug metabolism",
                        "evidence_level": "1A",
                        "population": "European"
                    })
            
            df = pd.DataFrame(dummy_pharmgkb_data)
            print(f"PharmGKB検索結果: {len(df)}件の薬理遺伝学変異")
            
            return df
            
        except Exception as e:
            print(f"PharmGKB検索エラー: {e}")
            return pd.DataFrame()
    
    def integrated_variant_analysis(self, gene_symbol: str, 
                                  include_population_data: bool = True) -> Dict:
        """
        統合的変異解析
        
        Args:
            gene_symbol: 遺伝子シンボル
            include_population_data: 集団データを含める場合True
        
        Returns:
            Dict: 統合解析結果
        """
        print(f"\n=== {gene_symbol} 統合変異解析 ===")
        
        # 各データベースから情報取得
        clinvar_data = self.search_clinvar_variants(gene_symbol, max_results=50)
        cosmic_data = self.search_cosmic_variants(gene_symbol)
        pharmgkb_data = self.search_pharmgkb_variants(gene_symbol)
        
        # 集団データ(gnomAD等からの頻度情報)
        population_data = {}
        if include_population_data:
            population_data = self._get_population_frequencies(gene_symbol)
        
        # 統合解析結果
        analysis_results = {
            "gene_symbol": gene_symbol,
            "total_clinvar_variants": len(clinvar_data),
            "total_cosmic_variants": len(cosmic_data),
            "total_pharmgkb_variants": len(pharmgkb_data),
            "clinvar_significance_distribution": {},
            "cosmic_cancer_distribution": {},
            "pharmgkb_drug_associations": {},
            "high_impact_variants": [],
            "population_frequencies": population_data
        }
        
        # ClinVar臨床的意義分布
        if not clinvar_data.empty:
            significance_counts = clinvar_data["clinical_significance"].value_counts()
            analysis_results["clinvar_significance_distribution"] = significance_counts.to_dict()
            
            # 高影響度変異の特定
            high_impact = clinvar_data[
                clinvar_data["clinical_significance"].str.contains(
                    "Pathogenic|Likely pathogenic", na=False, case=False
                )
            ]
            analysis_results["high_impact_variants"] = high_impact.to_dict('records')
        
        # COSMICがん種分布
        if not cosmic_data.empty:
            cancer_counts = cosmic_data["primary_site"].value_counts()
            analysis_results["cosmic_cancer_distribution"] = cancer_counts.to_dict()
        
        # PharmGKB薬物関連
        if not pharmgkb_data.empty:
            drug_counts = pharmgkb_data["drug_association"].value_counts()
            analysis_results["pharmgkb_drug_associations"] = drug_counts.to_dict()
        
        return analysis_results
    
    def _get_population_frequencies(self, gene_symbol: str, timeout: float = 30.0) -> Dict:
        """
        Look up gene-level metadata for a symbol via the Ensembl REST API.

        Despite its name, this method does not retrieve allele frequencies:
        it calls the ``lookup/symbol/homo_sapiens`` endpoint and returns the
        gene's identifier and genomic location.

        Args:
            gene_symbol: Gene symbol appended to the lookup URL.
            timeout: Seconds to wait for the HTTP request before giving up.

        Returns:
            Dict: On success, the keys ``ensembl_gene_id``, ``chromosome``,
            ``start``, ``end``, ``strand`` and ``biotype``. On failure, a
            single ``error`` key describing what went wrong.
        """
        url = f"{self.apis['ensembl']}lookup/symbol/homo_sapiens/{gene_symbol}"
        params = {"expand": "1"}

        try:
            # A timeout keeps a stalled connection from blocking the analysis.
            response = requests.get(url, params=params,
                                    headers={"Content-Type": "application/json"},
                                    timeout=timeout)

            if response.status_code != 200:
                return {"error": f"Ensembl lookup failed: {response.status_code}"}

            gene_data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            return {"error": f"Population data error: {e}"}

        if not isinstance(gene_data, dict):
            return {"error": "Population data error: unexpected response format"}

        return {
            "ensembl_gene_id": gene_data.get("id", ""),
            "chromosome": gene_data.get("seq_region_name", ""),
            "start": gene_data.get("start", 0),
            "end": gene_data.get("end", 0),
            "strand": gene_data.get("strand", 0),
            "biotype": gene_data.get("biotype", "")
        }
    
    def _cache_clinvar_data(self, df: pd.DataFrame):
        """ClinVarデータのキャッシュ保存"""
        if df.empty:
            return
        
        conn = sqlite3.connect(self.db_path)
        
        for _, row in df.iterrows():
            try:
                conn.execute('''
                    INSERT OR REPLACE INTO clinvar_variants 
                    (variation_id, gene_symbol, clinical_significance, 
                     condition, review_status, last_updated)
                    VALUES (?, ?, ?, ?, ?, ?)
                ''', (
                    row.get("variation_id", ""),
                    row.get("gene_symbol", ""),
                    row.get("clinical_significance", ""),
                    row.get("condition", ""),
                    row.get("review_status", ""),
                    row.get("last_updated", "")
                ))
            except Exception as e:
                print(f"キャッシュ保存エラー: {e}")
        
        conn.commit()
        conn.close()
    
    def generate_clinical_report(self, gene_symbol: str, output_format: str = "markdown") -> str:
        """
        臨床変異レポートの生成
        
        Args:
            gene_symbol: 遺伝子シンボル
            output_format: 出力形式("markdown", "html", "json")
        
        Returns:
            str: レポート内容
        """
        # 統合解析実行
        analysis = self.integrated_variant_analysis(gene_symbol)
        
        if output_format == "markdown":
            report = f"""# {gene_symbol} 臨床変異解析レポート

## 概要
- **遺伝子**: {gene_symbol}
- **ClinVar変異数**: {analysis['total_clinvar_variants']}
- **COSMIC変異数**: {analysis['total_cosmic_variants']}
- **PharmGKB薬理遺伝学変異数**: {analysis['total_pharmgkb_variants']}

## ClinVar臨床的意義分布
"""
            for significance, count in analysis["clinvar_significance_distribution"].items():
                report += f"- {significance}: {count}\n"
            
            report += "\n## 高影響度変異\n"
            for i, variant in enumerate(analysis["high_impact_variants"][:5], 1):
                report += f"{i}. {variant.get('title', 'N/A')} - {variant.get('clinical_significance', 'N/A')}\n"
            
            if analysis["cosmic_cancer_distribution"]:
                report += "\n## COSMIC がん種分布\n"
                for cancer, count in analysis["cosmic_cancer_distribution"].items():
                    report += f"- {cancer}: {count}\n"
            
            if analysis["pharmgkb_drug_associations"]:
                report += "\n## PharmGKB 薬物関連\n"
                for drug, count in analysis["pharmgkb_drug_associations"].items():
                    report += f"- {drug}: {count}\n"
            
            report += f"\n---\n生成日時: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
            
            return report
        
        elif output_format == "json":
            return json.dumps(analysis, indent=2, ensure_ascii=False)
        
        else:
            return str(analysis)

# Usage example: analyse a few well-known cancer genes and write one Markdown
# report per gene into the current directory.
if __name__ == "__main__":
    # Set up the analyzer with its own cache directory
    analyzer = ClinicalVariantAnalyzer(cache_dir="./clinical_analysis_cache")

    banner = "=" * 50

    # Major cancer-related genes
    for target_gene in ("BRCA1", "TP53", "EGFR", "KRAS"):
        print(f"\n{banner}")
        print(f"{target_gene} 遺伝子の臨床変異解析")
        print(banner)

        # Run the integrated analysis
        summary = analyzer.integrated_variant_analysis(target_gene, include_population_data=True)

        # Print the per-source totals
        print("\n解析結果サマリー:")
        for label, key in (
            ("ClinVar変異", "total_clinvar_variants"),
            ("COSMIC変異", "total_cosmic_variants"),
            ("PharmGKB変異", "total_pharmgkb_variants"),
        ):
            print(f"- {label}: {summary[key]}")

        # Build the report (this runs the analysis a second time internally)
        markdown_report = analyzer.generate_clinical_report(target_gene, output_format="markdown")

        # Write the report to disk
        report_path = Path(f"./clinical_report_{target_gene}.md")
        report_path.write_text(markdown_report, encoding='utf-8')

        print(f"レポート保存: {report_path}")

        # Pause between genes
        time.sleep(2)

    print("\n全ての解析が完了しました。")