I.2 主要データベースの実践的活用法

I.2.1 NCBI データベース群の効率的利用

GenBank/RefSeq: 配列データの取得と品質管理

from Bio import Entrez, SeqIO
import requests
import pandas as pd
from typing import List, Dict, Optional
import time
import logging

class NCBIDataRetriever:
    """Efficient batch retrieval of records from NCBI databases via Entrez.

    Wraps the Bio.Entrez E-utilities (esearch / efetch / esummary / elink)
    with logging, batching, and client-side rate limiting.
    """

    def __init__(self, email: str, api_key: Optional[str] = None):
        """
        Args:
            email: Contact e-mail address required by the NCBI usage policy.
            api_key: Optional NCBI API key; raises the allowed request rate.
        """
        Entrez.email = email
        if api_key:
            Entrez.api_key = api_key

        # NOTE(review): basicConfig mutates process-global logging state;
        # fine for a standalone script, undesirable inside a library.
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # NCBI allows 10 req/sec with an API key, 3 req/sec without.
        # (The original values 0.34/1.0 contradicted their own comment;
        # these delays match the documented limits with a little headroom.)
        self.request_delay = 0.11 if api_key else 0.34

    def search_sequences(self, query: str, database: str = "nucleotide",
                         max_results: int = 100, filters: Optional[Dict] = None) -> List[str]:
        """Search a database and return the matching record IDs.

        Args:
            query: Search query, e.g. ``"BRCA1[Gene] AND human[Organism]"``.
            database: Target Entrez database.
            max_results: Maximum number of IDs to return.
            filters: Optional constraints; recognised keys are ``organism``,
                ``molecular_type`` and ``date_range``.

        Returns:
            List[str]: Matching record IDs.

        Raises:
            Exception: Re-raises any Entrez failure after logging it.
        """
        try:
            # Translate the filter dict into Entrez field-qualified terms.
            if filters:
                filter_terms = []
                if "organism" in filters:
                    filter_terms.append(f"{filters['organism']}[Organism]")
                if "molecular_type" in filters:
                    filter_terms.append(f"{filters['molecular_type']}[Properties]")
                if "date_range" in filters:
                    filter_terms.append(f"{filters['date_range']}[Publication Date]")

                if filter_terms:
                    query += " AND " + " AND ".join(filter_terms)

            self.logger.info(f"実行クエリ: {query}")

            handle = Entrez.esearch(db=database, term=query, retmax=max_results)
            search_results = Entrez.read(handle)
            handle.close()

            id_list = search_results["IdList"]
            self.logger.info(f"検索結果: {len(id_list)}")

            return id_list

        except Exception as e:
            self.logger.error(f"検索エラー: {e}")
            raise

    def fetch_sequences_batch(self, id_list: List[str], database: str = "nucleotide",
                              batch_size: int = 100, format: str = "fasta") -> Dict[str, str]:
        """Fetch sequence records in batches via efetch.

        Args:
            id_list: Record IDs to download.
            database: Entrez database name.
            batch_size: Number of IDs per efetch request.
            format: efetch ``rettype`` ("fasta", "gb", ...).  The name shadows
                the ``format`` builtin but is kept for caller compatibility.

        Returns:
            Dict[str, str]: Record ID -> record text (bare sequence for FASTA).
        """
        sequences: Dict[str, str] = {}

        for i in range(0, len(id_list), batch_size):
            batch_ids = id_list[i:i + batch_size]

            try:
                self.logger.info(f"バッチ {i//batch_size + 1}: {len(batch_ids)}件処理中")

                handle = Entrez.efetch(
                    db=database,
                    id=",".join(batch_ids),
                    rettype=format,
                    retmode="text"
                )

                if format == "fasta":
                    for record in SeqIO.parse(handle, "fasta"):
                        sequences[record.id] = str(record.seq)
                elif format in ("gb", "genbank", "gbwithparts"):
                    # Parse GenBank records individually so each ID maps to
                    # its own record text instead of the whole batch payload.
                    for record in SeqIO.parse(handle, "genbank"):
                        sequences[record.id] = record.format("genbank")
                else:
                    # BUG FIX: the original dict comprehension called
                    # handle.read() once per ID, so the first ID received the
                    # entire payload and every later ID got "".  Read once
                    # and store the raw (unsplit) payload for every ID so no
                    # data is silently lost.
                    raw = handle.read()
                    for record_id in batch_ids:
                        sequences[record_id] = raw

                handle.close()

                # Client-side rate limiting between requests.
                time.sleep(self.request_delay)

            except Exception as e:
                # Keep going: one failed batch should not abort the others.
                self.logger.error(f"バッチ処理エラー (batch {i//batch_size + 1}): {e}")
                continue

        return sequences

    def get_gene_summary(self, gene_symbol: str, organism: str = "human") -> Dict:
        """Collect a gene's summary plus linked nucleotide records.

        Args:
            gene_symbol: Gene symbol, e.g. ``"BRCA1"``.
            organism: Organism name used in the query.

        Returns:
            Dict: Gene metadata, or ``{"error": ...}`` on failure.
        """
        try:
            gene_query = f"{gene_symbol}[Gene] AND {organism}[Organism]"
            gene_handle = Entrez.esearch(db="gene", term=gene_query, retmax=1)
            gene_results = Entrez.read(gene_handle)
            gene_handle.close()

            if not gene_results["IdList"]:
                return {"error": f"遺伝子 {gene_symbol} が見つかりません"}

            gene_id = gene_results["IdList"][0]

            summary_handle = Entrez.esummary(db="gene", id=gene_id)
            summary_data = Entrez.read(summary_handle)
            summary_handle.close()

            # ROBUSTNESS: eSummary for db="gene" returns either a plain list
            # of DocSums (version 1.0) or a DocumentSummarySet mapping
            # (version 2.0); indexing [0] directly broke on the latter.
            if isinstance(summary_data, list):
                summary = summary_data[0]
            else:
                summary = summary_data["DocumentSummarySet"]["DocumentSummary"][0]

            link_handle = Entrez.elink(dbfrom="gene", db="nucleotide", id=gene_id)
            link_results = Entrez.read(link_handle)
            link_handle.close()

            nucleotide_ids = []
            if link_results[0]["LinkSetDb"]:
                nucleotide_ids = [link["Id"] for link in link_results[0]["LinkSetDb"][0]["Link"]]

            return {
                "gene_id": gene_id,
                "symbol": summary.get("Name", ""),
                "description": summary.get("Description", ""),
                "summary": summary.get("Summary", ""),
                "chromosome": summary.get("Chromosome", ""),
                "map_location": summary.get("MapLocation", ""),
                "gene_type": summary.get("GeneType", ""),
                "associated_sequences": len(nucleotide_ids),
                "nucleotide_ids": nucleotide_ids[:10]  # cap at the first 10
            }

        except Exception as e:
            self.logger.error(f"遺伝子情報取得エラー: {e}")
            return {"error": str(e)}
# Usage example
if __name__ == "__main__":
    # Instantiate the retriever (supply a real e-mail address / API key).
    ncbi = NCBIDataRetriever(
        email="your.email@example.com",
        api_key="your_api_key_here"  # optional
    )

    # 1. Fetch basic information for the BRCA1 gene.
    brca1 = ncbi.get_gene_summary("BRCA1", "human")
    print("BRCA1遺伝子情報:")
    for label, key in (("説明", "description"),
                       ("染色体", "chromosome"),
                       ("関連配列数", "associated_sequences")):
        print(f"- {label}: {brca1.get(key, 'N/A')}")

    # 2. Search for and download COVID-19 related sequences.
    sars_cov2_filters = {
        "organism": "SARS-CoV-2",
        "molecular_type": "genomic RNA",
        "date_range": "2020/01/01:2024/12/31"
    }

    hit_ids = ncbi.search_sequences(
        query="complete genome",
        database="nucleotide",
        max_results=50,
        filters=sars_cov2_filters
    )

    if hit_ids:
        # Fetch only the first ten hits, in small batches (smoke test).
        fetched = ncbi.fetch_sequences_batch(hit_ids[:10], batch_size=5)

        print(f"\n取得したCOVID-19配列: {len(fetched)}")
        for acc, seq in list(fetched.items())[:3]:
            print(f"- {acc}: {len(seq)} bp")

SRA: シークエンシングデータの効率的アクセス

import subprocess
import os
import pandas as pd
from pathlib import Path
import xml.etree.ElementTree as ET
import requests
from typing import List, Dict, Optional
import concurrent.futures
import hashlib

class SRADataManager:
    """Search, download, and verify SRA (Sequence Read Archive) data.

    Relies on the external SRA Toolkit (``fastq-dump``) for downloads and on
    the NCBI E-utilities for metadata queries.
    """

    def __init__(self, work_dir: str = "./sra_data", max_workers: int = 4):
        """
        Args:
            work_dir: Working directory for downloaded data.
            max_workers: Default number of parallel downloads.
        """
        self.work_dir = Path(work_dir)
        # parents=True so a nested work_dir does not raise FileNotFoundError.
        self.work_dir.mkdir(parents=True, exist_ok=True)
        self.max_workers = max_workers

        # Warn early if the external toolchain is missing.
        self._check_sra_toolkit()

    def _check_sra_toolkit(self):
        """Warn (without raising) when the SRA Toolkit is not installed."""
        try:
            result = subprocess.run(["fastq-dump", "--version"],
                                    capture_output=True, text=True)
            if result.returncode == 0:
                print(f"SRA Toolkit確認済み: {result.stdout.strip()}")
            else:
                raise FileNotFoundError
        except FileNotFoundError:
            print("警告: SRA Toolkitが見つかりません")
            print("インストール方法: conda install -c bioconda sra-tools")

    def search_sra_studies(self, query: str, max_results: int = 100) -> pd.DataFrame:
        """Search SRA studies through the NCBI E-utilities.

        Args:
            query: Search query, e.g. ``"RNA-seq AND human AND cancer"``.
            max_results: Maximum number of studies to return.

        Returns:
            pd.DataFrame: Study metadata (empty on failure or no hits).
        """
        try:
            # Step 1: ESearch returns the matching SRA record IDs.
            search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
            search_params = {
                "db": "sra",
                "term": query,
                "retmax": max_results,
                "retmode": "xml"
            }

            # timeout so a stalled NCBI connection cannot hang the caller
            response = requests.get(search_url, params=search_params, timeout=30)
            root = ET.fromstring(response.content)

            sra_ids = [id_elem.text for id_elem in root.findall(".//Id")]

            if not sra_ids:
                print("検索結果なし")
                return pd.DataFrame()

            # Step 2: ESummary resolves the IDs into study metadata.
            summary_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
            summary_params = {
                "db": "sra",
                "id": ",".join(sra_ids),
                "retmode": "xml"
            }

            response = requests.get(summary_url, params=summary_params, timeout=30)
            root = ET.fromstring(response.content)

            studies = []
            for doc_sum in root.findall(".//DocSum"):
                study_info = {"SRA_ID": doc_sum.find("Id").text}

                for item in doc_sum.findall(".//Item"):
                    name = item.get("Name")
                    if name in ["Title", "Platform", "Organism", "LibraryStrategy",
                                "LibrarySource", "SampleAccession", "StudyAccession"]:
                        study_info[name] = item.text or ""

                studies.append(study_info)

            df = pd.DataFrame(studies)
            print(f"検索結果: {len(df)}件のSRAスタディを取得")

            return df

        except Exception as e:
            print(f"SRA検索エラー: {e}")
            return pd.DataFrame()

    def get_run_info(self, study_accession: str) -> pd.DataFrame:
        """Fetch per-run metadata for a study as a DataFrame.

        Args:
            study_accession: SRA study accession, e.g. ``SRP123456``.

        Returns:
            pd.DataFrame: Run metadata restricted to commonly used columns;
            empty on error.
        """
        try:
            # NOTE(review): this Traces CGI endpoint is legacy and may be
            # retired — confirm against the current SRA documentation.
            url = f"https://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&db=sra&rettype=runinfo&term={study_accession}"

            response = requests.get(url, timeout=30)
            if response.status_code == 200:
                # The endpoint returns CSV text; parse it into a DataFrame.
                from io import StringIO
                df = pd.read_csv(StringIO(response.text))

                important_cols = [
                    "Run", "SampleName", "Experiment", "LibraryStrategy",
                    "LibrarySource", "Platform", "Instrument", "InsertSize",
                    "LibraryLayout", "spots", "bases", "download_path"
                ]

                # Tolerate schema drift: keep only columns actually present.
                available_cols = [col for col in important_cols if col in df.columns]
                df_filtered = df[available_cols]

                print(f"ラン情報取得: {len(df_filtered)}")
                return df_filtered
            else:
                print(f"RunInfo取得失敗: HTTP {response.status_code}")
                return pd.DataFrame()

        except Exception as e:
            print(f"RunInfo取得エラー: {e}")
            return pd.DataFrame()

    def download_fastq(self, run_accession: str, output_dir: Optional[str] = None,
                       paired: bool = True, compressed: bool = True) -> Dict[str, str]:
        """Download FASTQ file(s) for one run via ``fastq-dump``.

        Args:
            run_accession: Run accession, e.g. ``SRR123456``.
            output_dir: Output directory (defaults to ``<work_dir>/fastq``).
            paired: Split paired-end reads into ``_1`` / ``_2`` files.
            compressed: Write gzip-compressed output.

        Returns:
            Dict[str, str]: Read label -> file path; empty dict on failure.
        """
        if output_dir is None:
            output_dir = self.work_dir / "fastq"

        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        try:
            cmd = ["fastq-dump"]

            if paired:
                cmd.extend(["--split-files"])  # one output file per mate

            if compressed:
                cmd.extend(["--gzip"])

            cmd.extend(["--outdir", str(output_dir)])
            cmd.append(run_accession)

            print(f"FASTQダウンロード開始: {run_accession}")
            print(f"コマンド: {' '.join(cmd)}")

            # List-form argv with shell=False: accession is never shell-parsed.
            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode == 0:
                # Collect whichever output files fastq-dump actually produced.
                downloaded_files = {}
                suffix = ".fastq.gz" if compressed else ".fastq"

                if paired:
                    for i in [1, 2]:
                        filepath = output_dir / f"{run_accession}_{i}{suffix}"
                        if filepath.exists():
                            downloaded_files[f"read_{i}"] = str(filepath)
                else:
                    filepath = output_dir / f"{run_accession}{suffix}"
                    if filepath.exists():
                        downloaded_files["reads"] = str(filepath)

                print(f"ダウンロード完了: {len(downloaded_files)}ファイル")
                return downloaded_files
            else:
                print(f"ダウンロードエラー: {result.stderr}")
                return {}

        except Exception as e:
            print(f"FASTQダウンロードエラー: {e}")
            return {}

    def batch_download(self, run_list: List[str], max_concurrent: Optional[int] = None) -> Dict[str, Dict]:
        """Download several runs in parallel threads.

        Args:
            run_list: Run accessions to download.
            max_concurrent: Maximum parallel downloads (defaults to
                ``self.max_workers``).

        Returns:
            Dict[str, Dict]: Run accession -> download result
            (``{"error": ...}`` on per-run failure).
        """
        if max_concurrent is None:
            max_concurrent = self.max_workers

        results = {}

        # I/O-bound work (subprocess + network), so threads are appropriate.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_concurrent) as executor:
            future_to_run = {
                executor.submit(self.download_fastq, run): run
                for run in run_list
            }

            for future in concurrent.futures.as_completed(future_to_run):
                run = future_to_run[future]
                try:
                    results[run] = future.result()
                    print(f"完了: {run}")
                except Exception as e:
                    print(f"エラー {run}: {e}")
                    results[run] = {"error": str(e)}

        return results

    def verify_download_integrity(self, file_path: str, expected_md5: Optional[str] = None) -> bool:
        """Check that a downloaded file exists and (optionally) matches an MD5.

        Args:
            file_path: Path of the file to verify.
            expected_md5: Expected MD5 hex digest; if omitted, only the
                file's existence is checked.

        Returns:
            bool: True when the file exists and the digest (if given) matches.
        """
        if not os.path.exists(file_path):
            return False

        if expected_md5:
            hasher = hashlib.md5()
            # Hash in chunks to keep memory flat for multi-GB FASTQ files.
            with open(file_path, 'rb') as f:
                for chunk in iter(lambda: f.read(65536), b""):
                    hasher.update(chunk)
            return hasher.hexdigest() == expected_md5

        # No digest supplied: existence is the only check we can make.
        return True

# Usage example
if __name__ == "__main__":
    # Set up the SRA data manager.
    manager = SRADataManager(work_dir="./sra_analysis", max_workers=2)

    # 1. Search for COVID-19 related RNA-seq studies.
    studies = manager.search_sra_studies(
        query="COVID-19 AND RNA-seq AND human",
        max_results=20
    )

    if not studies.empty:
        print("\n検索結果(最初の5件):")
        print(studies.head()[["StudyAccession", "Title", "Platform"]].to_string())

        # 2. Fetch run-level metadata for the first study found.
        study_acc = studies.iloc[0]["StudyAccession"]
        runs = manager.get_run_info(study_acc)

        if not runs.empty:
            print(f"\n{study_acc}のラン情報:")
            print(runs.head()[["Run", "LibraryStrategy", "spots", "bases"]].to_string())

            # 3. Download a couple of runs as a small smoke test.
            sample_runs = runs.head(2)["Run"].tolist()
            print(f"\nテストダウンロード開始: {sample_runs}")

            # Download one at a time.
            for run_acc in sample_runs:
                files = manager.download_fastq(run_acc, compressed=True)
                if files:
                    print(f"ダウンロード成功 {run_acc}: {list(files.keys())}")
                else:
                    print(f"ダウンロード失敗: {run_acc}")

I.2.2 UniProt データベースの高度な活用

import requests
import pandas as pd
import json
from typing import List, Dict, Optional, Union
import time
import re
from urllib.parse import urlencode
import xml.etree.ElementTree as ET

class UniProtAnalyzer:
    """Comprehensive analysis client for the UniProt protein database.

    Talks to the UniProt REST API (and STRING for interactions) with a
    configurable client-side rate limit.
    """

    def __init__(self, rate_limit: float = 1.0):
        """
        Args:
            rate_limit: Pause between requests, in seconds.
        """
        self.base_url = "https://rest.uniprot.org"
        self.rate_limit = rate_limit
        # A Session reuses the underlying connection across requests.
        self.session = requests.Session()

        # Commonly requested result-field presets.
        self.common_fields = {
            "basic": [
                "accession", "id", "gene_names", "protein_name",
                "organism_name", "length", "mass"
            ],
            "sequence": [
                "accession", "sequence", "length", "mass",
                "cc_subcellular_location", "ft_domain"
            ],
            "function": [
                "accession", "protein_name", "cc_function",
                "go_c", "go_f", "go_p", "cc_pathway"
            ],
            "disease": [
                "accession", "gene_names", "cc_disease",
                "cc_involvement_in_disease", "cc_allergen", "cc_toxic_dose"
            ],
            "structure": [
                "accession", "ft_helix", "ft_strand", "ft_turn",
                "ft_disulfid", "xref_pdb", "cc_similarity"
            ]
        }

    def search_proteins(self, query: str, organism: Optional[str] = None,
                        reviewed: Optional[bool] = None, max_results: int = 100,
                        fields: Optional[List[str]] = None) -> pd.DataFrame:
        """Search UniProtKB and return the hits as a DataFrame.

        Args:
            query: Search query (gene name, protein name, keywords, ...).
            organism: Organism name or NCBI taxonomy ID (e.g. "human", "9606").
            reviewed: True for Swiss-Prot only, False for TrEMBL only.
            max_results: Maximum records (capped at 500 per request).
            fields: Result columns; defaults to ``common_fields["basic"]``.

        Returns:
            pd.DataFrame: Search results (empty on error).
        """
        search_terms = [query]

        if organism:
            if organism.isdigit():
                search_terms.append(f"taxonomy_id:{organism}")
            else:
                # NOTE(review): the current REST API names this field
                # "organism_name"; "organism" is the legacy spelling — verify.
                search_terms.append(f"organism:{organism}")

        if reviewed is not None:
            search_terms.append("reviewed:true" if reviewed else "reviewed:false")

        final_query = " AND ".join(search_terms)

        if fields is None:
            fields = self.common_fields["basic"]

        params = {
            "query": final_query,
            "format": "tsv",
            "fields": ",".join(fields),
            "size": min(max_results, 500)  # API page-size cap
        }

        try:
            # timeout so a stalled connection cannot hang the caller
            response = self.session.get(f"{self.base_url}/uniprotkb/search",
                                        params=params, timeout=30)
            response.raise_for_status()

            # The TSV payload parses directly into a DataFrame.
            from io import StringIO
            df = pd.read_csv(StringIO(response.text), sep='\t')

            print(f"検索結果: {len(df)}件のタンパク質")
            return df

        except requests.exceptions.RequestException as e:
            print(f"UniProt検索エラー: {e}")
            return pd.DataFrame()

        finally:
            # Honour the client-side rate limit even on failure.
            time.sleep(self.rate_limit)

    def get_protein_details(self, accession: str,
                            include_features: bool = True,
                            include_interactions: bool = True) -> Dict:
        """Fetch and structure the full record of one protein.

        Args:
            accession: UniProt accession (e.g. ``P38398``).
            include_features: Include sequence-feature records.
            include_interactions: Include STRING interaction partners.

        Returns:
            Dict: Structured protein information (empty dict on error).
        """
        try:
            response = self.session.get(f"{self.base_url}/uniprotkb/{accession}",
                                        timeout=30)
            response.raise_for_status()

            protein_data = response.json()

            # Flatten the nested UniProt JSON into a stable dict shape.
            details = {
                "accession": accession,
                "entry_name": protein_data.get("uniProtkbId", ""),
                "protein_names": self._extract_protein_names(protein_data),
                "gene_names": self._extract_gene_names(protein_data),
                "organism": self._extract_organism(protein_data),
                "sequence_info": self._extract_sequence_info(protein_data),
                "subcellular_location": self._extract_subcellular_location(protein_data),
                "function": self._extract_function(protein_data),
                "go_annotations": self._extract_go_annotations(protein_data),
                "pathways": self._extract_pathways(protein_data),
                "diseases": self._extract_diseases(protein_data)
            }

            if include_features:
                details["features"] = self._extract_features(protein_data)

            if include_interactions:
                details["interactions"] = self.get_protein_interactions(accession)

            return details

        except requests.exceptions.RequestException as e:
            print(f"タンパク質詳細取得エラー: {e}")
            return {}

        finally:
            time.sleep(self.rate_limit)

    def _extract_protein_names(self, data: Dict) -> Dict:
        """Extract recommended / alternative / short protein names."""
        names = {"recommended": "", "alternative": [], "short": []}

        if "proteinDescription" in data:
            desc = data["proteinDescription"]
            if "recommendedName" in desc:
                names["recommended"] = desc["recommendedName"].get("fullName", {}).get("value", "")

            if "alternativeNames" in desc:
                for alt in desc["alternativeNames"]:
                    if "fullName" in alt:
                        names["alternative"].append(alt["fullName"].get("value", ""))
                    if "shortNames" in alt:
                        names["short"].extend([sn.get("value", "") for sn in alt["shortNames"]])

        return names

    def _extract_gene_names(self, data: Dict) -> Dict:
        """Extract primary gene name, synonyms, locus and ORF names."""
        genes = {"primary": "", "synonyms": [], "ordered_locus": [], "orf": []}

        if "genes" in data:
            for gene in data["genes"]:
                if gene.get("geneName"):
                    genes["primary"] = gene["geneName"].get("value", "")

                if "synonyms" in gene:
                    genes["synonyms"].extend([syn.get("value", "") for syn in gene["synonyms"]])

                if "orderedLocusNames" in gene:
                    genes["ordered_locus"].extend([oln.get("value", "") for oln in gene["orderedLocusNames"]])

                if "orfNames" in gene:
                    genes["orf"].extend([orf.get("value", "") for orf in gene["orfNames"]])

        return genes

    def _extract_organism(self, data: Dict) -> Dict:
        """Extract scientific/common organism names and taxonomy ID."""
        organism = {"scientific_name": "", "common_name": "", "taxonomy_id": 0}

        if "organism" in data:
            org = data["organism"]
            organism["scientific_name"] = org.get("scientificName", "")
            organism["common_name"] = org.get("commonName", "")
            organism["taxonomy_id"] = org.get("taxId", 0)

        return organism

    def _extract_sequence_info(self, data: Dict) -> Dict:
        """Extract sequence length, mass, checksum and the raw sequence."""
        seq_info = {"length": 0, "mass": 0, "checksum": "", "sequence": ""}

        if "sequence" in data:
            seq = data["sequence"]
            seq_info["length"] = seq.get("length", 0)
            seq_info["mass"] = seq.get("molWeight", 0)
            seq_info["checksum"] = seq.get("crc64", "")
            seq_info["sequence"] = seq.get("value", "")

        return seq_info

    def _extract_subcellular_location(self, data: Dict) -> List[str]:
        """Extract subcellular-location values from the comment records.

        ADDED: get_protein_details called this helper but it was never
        defined, so every call raised AttributeError.
        """
        locations = []
        for comment in data.get("comments", []):
            if comment.get("commentType") == "SUBCELLULAR LOCATION":
                for loc in comment.get("subcellularLocations", []):
                    value = loc.get("location", {}).get("value", "")
                    if value:
                        locations.append(value)
        return locations

    def _extract_function(self, data: Dict) -> str:
        """Extract the first FUNCTION comment text (ADDED, was missing)."""
        for comment in data.get("comments", []):
            if comment.get("commentType") == "FUNCTION":
                texts = comment.get("texts", [])
                if texts:
                    return texts[0].get("value", "")
        return ""

    def _extract_pathways(self, data: Dict) -> List[str]:
        """Extract PATHWAY comment texts (ADDED, was missing)."""
        pathways = []
        for comment in data.get("comments", []):
            if comment.get("commentType") == "PATHWAY":
                pathways.extend(t.get("value", "") for t in comment.get("texts", []))
        return pathways

    def _extract_diseases(self, data: Dict) -> List[Dict]:
        """Extract DISEASE comment records (ADDED, was missing)."""
        diseases = []
        for comment in data.get("comments", []):
            if comment.get("commentType") == "DISEASE":
                disease = comment.get("disease", {})
                diseases.append({
                    "name": disease.get("diseaseId", ""),
                    "description": disease.get("description", "")
                })
        return diseases

    def _extract_features(self, data: Dict) -> List[Dict]:
        """Extract sequence features with positions (ADDED, was missing)."""
        features = []
        for feature in data.get("features", []):
            location = feature.get("location", {})
            features.append({
                "type": feature.get("type", ""),
                "description": feature.get("description", ""),
                "start": location.get("start", {}).get("value"),
                "end": location.get("end", {}).get("value")
            })
        return features

    def _extract_go_annotations(self, data: Dict) -> Dict:
        """Extract GO annotations grouped by ontology aspect.

        UniProt encodes the aspect in the GoTerm value prefix: ``F:`` is
        molecular function, ``P:`` biological process, ``C:`` cellular
        component.
        """
        go_terms = {"molecular_function": [], "biological_process": [], "cellular_component": []}

        for ref in data.get("uniProtKBCrossReferences", []):
            if ref.get("database") != "GO":
                continue

            go_id = ref.get("id", "")
            go_value = ""
            go_evidence = ""

            for prop in ref.get("properties", []):
                key = prop.get("key")
                if key == "GoTerm":
                    go_value = prop.get("value", "")
                elif key == "GoEvidenceType":
                    go_evidence = prop.get("value", "")

            # BUG FIX: the original stripped the "F:"/"P:"/"C:" prefix first
            # and then searched the stripped text for that prefix, so no
            # annotation was ever classified.  Split once, dispatch on the
            # prefix, and keep the bare term text.
            aspect, _, term = go_value.partition(":")
            go_entry = {"id": go_id, "term": term, "evidence": go_evidence}

            if go_id.startswith("GO:"):
                if aspect == "F":
                    go_terms["molecular_function"].append(go_entry)
                elif aspect == "P":
                    go_terms["biological_process"].append(go_entry)
                elif aspect == "C":
                    go_terms["cellular_component"].append(go_entry)

        return go_terms

    def get_protein_interactions(self, accession: str, max_interactions: int = 50) -> List[Dict]:
        """Fetch protein-protein interaction partners from STRING.

        Args:
            accession: UniProt accession.
            max_interactions: Maximum number of interactions to return.

        Returns:
            List[Dict]: Interaction records (empty list on error).
        """
        try:
            string_url = "https://string-db.org/api/json/network"
            params = {
                "identifiers": accession,
                "species": 9606,  # NCBI taxonomy ID for human
                "limit": max_interactions
            }

            response = self.session.get(string_url, params=params, timeout=30)
            response.raise_for_status()

            interactions = []
            for interaction in response.json():
                interactions.append({
                    "partner_a": interaction.get("preferredName_A", ""),
                    "partner_b": interaction.get("preferredName_B", ""),
                    "score": interaction.get("score", 0),
                    "interaction_type": "protein-protein"
                })

            return interactions

        except requests.exceptions.RequestException as e:
            print(f"相互作用情報取得エラー: {e}")
            return []

        finally:
            time.sleep(self.rate_limit)

    def analyze_protein_family(self, gene_family: str, organisms: Optional[List[str]] = None) -> pd.DataFrame:
        """Compare one protein family across several organisms.

        Args:
            gene_family: Family name, e.g. ``"histone"``, ``"kinase"``.
            organisms: Organisms to include (defaults to four model species).

        Returns:
            pd.DataFrame: Combined per-organism results (empty on no hits).
        """
        if organisms is None:
            organisms = ["human", "mouse", "rat", "zebrafish"]

        family_data = []

        for organism in organisms:
            print(f"{organism}{gene_family}ファミリーを検索中...")

            results = self.search_proteins(
                query=gene_family,
                organism=organism,
                reviewed=True,
                max_results=100,
                fields=self.common_fields["basic"] + ["cc_function", "go_f"]
            )

            if not results.empty:
                # Tag each row with its organism/family for later grouping.
                results["organism"] = organism
                results["family"] = gene_family
                family_data.append(results)

            time.sleep(self.rate_limit)

        if family_data:
            combined_df = pd.concat(family_data, ignore_index=True)

            print(f"\n{gene_family}ファミリー解析結果:")
            print(f"- 総タンパク質数: {len(combined_df)}")
            print(f"- 生物種別分布:")
            print(combined_df["organism"].value_counts().to_string())

            return combined_df
        else:
            return pd.DataFrame()

    def functional_enrichment_analysis(self, protein_list: List[str]) -> Dict:
        """Naive functional enrichment over a list of proteins.

        Counts GO terms and pathways across the proteins' records; this is a
        frequency tally, not a statistical enrichment test.

        Args:
            protein_list: UniProt accessions (only the first 20 are fetched).

        Returns:
            Dict: Top-10 ranked GO terms per aspect, pathways, and the
            number of proteins analysed (empty dict when nothing fetched).
        """
        proteins_data = []
        for accession in protein_list[:20]:  # cap the number of API calls
            details = self.get_protein_details(accession, include_features=False)
            if details:
                proteins_data.append(details)
            time.sleep(self.rate_limit)

        if not proteins_data:
            return {}

        # Tally GO terms per aspect.
        go_counts = {"molecular_function": {}, "biological_process": {}, "cellular_component": {}}

        for protein in proteins_data:
            go_annotations = protein.get("go_annotations", {})
            for category, terms in go_annotations.items():
                for term in terms:
                    term_id = term.get("term", "Unknown")
                    go_counts[category][term_id] = go_counts[category].get(term_id, 0) + 1

        # Tally pathway mentions.
        pathway_counts = {}
        for protein in proteins_data:
            for pathway in protein.get("pathways", []):
                pathway_counts[pathway] = pathway_counts.get(pathway, 0) + 1

        # Rank each tally descending and keep the top ten.
        enrichment_results = {
            "go_molecular_function": sorted(go_counts["molecular_function"].items(),
                                            key=lambda x: x[1], reverse=True)[:10],
            "go_biological_process": sorted(go_counts["biological_process"].items(),
                                            key=lambda x: x[1], reverse=True)[:10],
            "go_cellular_component": sorted(go_counts["cellular_component"].items(),
                                            key=lambda x: x[1], reverse=True)[:10],
            "pathways": sorted(pathway_counts.items(), key=lambda x: x[1], reverse=True)[:10],
            "total_proteins": len(proteins_data)
        }

        return enrichment_results

# Usage example
if __name__ == "__main__":
    # Initialise the UniProt analysis client.
    client = UniProtAnalyzer(rate_limit=1.0)

    # 1. Detailed analysis of the BRCA1 protein (accession P38398).
    print("=== BRCA1タンパク質詳細解析 ===")
    details = client.get_protein_details(
        "P38398", include_features=True, include_interactions=True
    )

    if details:
        seq_info = details["sequence_info"]
        print(f"タンパク質名: {details['protein_names']['recommended']}")
        print(f"遺伝子名: {details['gene_names']['primary']}")
        print(f"配列長: {seq_info['length']} aa")
        print(f"分子量: {seq_info['mass']} Da")
        print(f"相互作用数: {len(details.get('interactions', []))}")

    # 2. Comparative analysis of the kinase family across species.
    print("\n=== キナーゼファミリー比較解析 ===")
    kinases = client.analyze_protein_family(
        gene_family="protein kinase", organisms=["human", "mouse"]
    )

    if not kinases.empty:
        print("\n上位10タンパク質:")
        top10 = kinases.head(10)
        print(top10[["Entry", "Gene Names", "Protein names", "organism"]].to_string())

    # 3. Functional enrichment of cancer-related proteins.
    print("\n=== がん関連タンパク質の機能的濃縮解析 ===")
    cancer_proteins = ["P53_HUMAN", "P38398", "P04637", "P21359", "Q02952"]  # example IDs

    # Resolve a fresh accession list via a search instead.
    hits = client.search_proteins(
        "cancer AND tumor suppressor",
        organism="human",
        reviewed=True,
        max_results=20
    )

    if not hits.empty:
        accessions = hits["Entry"].tolist()[:10]
        enrichment = client.functional_enrichment_analysis(accessions)

        print(f"解析対象タンパク質数: {enrichment.get('total_proteins', 0)}")
        for heading, key in (("Molecular Function", "go_molecular_function"),
                             ("Biological Process", "go_biological_process")):
            print(f"\n上位GO {heading}:")
            for go_term, hit_count in enrichment.get(key, [])[:5]:
                print(f"  {go_term}: {hit_count}")