I.2 主要データベースの実践的活用法

I.2.1 NCBI データベース群の効率的利用

GenBank/RefSeq: 配列データの取得と品質管理

🧪 概念例(擬似コード)

from Bio import Entrez, SeqIO
import requests
import pandas as pd
from typing import List, Dict, Optional
import time
import logging

class NCBIDataRetriever:
    """Efficient data retrieval from NCBI databases via the Entrez E-utilities."""

    def __init__(self, email: str, api_key: Optional[str] = None):
        """
        Args:
            email: Contact e-mail address required by the NCBI usage policy.
            api_key: Optional API key that raises the request-rate limit (recommended).
        """
        Entrez.email = email
        if api_key:
            Entrez.api_key = api_key

        # Logging setup
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Rate limiting: NCBI allows 10 req/sec with an API key, 3 req/sec without.
        self.request_delay = 0.34 if api_key else 1.0

    def search_sequences(self, query: str, database: str = "nucleotide",
                        max_results: int = 100, filters: Optional[Dict] = None) -> List[str]:
        """
        Search sequences and return their NCBI UIDs.

        Args:
            query: Search query (e.g. "BRCA1[Gene] AND human[Organism]").
            database: Target Entrez database.
            max_results: Maximum number of hits to return.
            filters: Optional filter conditions; recognised keys are
                "organism", "molecular_type" and "date_range".

        Returns:
            List[str]: NCBI UIDs. NOTE: esearch returns numeric UIDs, not
            GenBank accession strings (the original docstring was misleading).

        Raises:
            Exception: Any Entrez error is logged and re-raised.
        """
        try:
            # Append filter terms to the query
            if filters:
                filter_terms = []
                if "organism" in filters:
                    filter_terms.append(f"{filters['organism']}[Organism]")
                if "molecular_type" in filters:
                    filter_terms.append(f"{filters['molecular_type']}[Properties]")
                if "date_range" in filters:
                    filter_terms.append(f"{filters['date_range']}[Publication Date]")

                if filter_terms:
                    query += " AND " + " AND ".join(filter_terms)

            self.logger.info(f"実行クエリ: {query}")

            # Execute the search
            handle = Entrez.esearch(db=database, term=query, retmax=max_results)
            search_results = Entrez.read(handle)
            handle.close()

            id_list = search_results["IdList"]
            self.logger.info(f"検索結果: {len(id_list)}件")

            # Respect the NCBI rate limit here too (the original only slept
            # inside fetch_sequences_batch).
            time.sleep(self.request_delay)

            return id_list

        except Exception as e:
            self.logger.error(f"検索エラー: {e}")
            raise

    def fetch_sequences_batch(self, id_list: List[str], database: str = "nucleotide",
                             batch_size: int = 100, format: str = "fasta") -> Dict[str, str]:
        """
        Fetch sequence data in batches.

        Args:
            id_list: NCBI UIDs (or accessions).
            database: Entrez database name.
            batch_size: Number of IDs per efetch request.
            format: efetch rettype ("fasta", "gb"/"genbank", or other).
                Parameter name kept for backward compatibility although it
                shadows the builtin.

        Returns:
            Dict[str, str]: Mapping of record ID -> sequence data.
        """
        sequences = {}

        # Process the ID list in batches to stay within URL/size limits
        for i in range(0, len(id_list), batch_size):
            batch_ids = id_list[i:i + batch_size]

            try:
                self.logger.info(f"バッチ {i//batch_size + 1}: {len(batch_ids)}件処理中")

                # Fetch the batch
                handle = Entrez.efetch(
                    db=database,
                    id=",".join(batch_ids),
                    rettype=format,
                    retmode="text"
                )

                if format == "fasta":
                    # Parse FASTA records; each record carries its own ID
                    for record in SeqIO.parse(handle, "fasta"):
                        sequences[record.id] = str(record.seq)
                elif format in ("gb", "genbank"):
                    # Parse GenBank records so every ID maps to its own entry
                    for record in SeqIO.parse(handle, "genbank"):
                        sequences[record.id] = record.format("genbank")
                else:
                    # BUG FIX: the original used
                    #   {id: handle.read() for id in batch_ids}
                    # which hands the ENTIRE payload to the first ID and empty
                    # strings to all others (handle.read() is exhausted after
                    # the first call).  Unknown formats cannot be split into
                    # per-ID records generically, so store the raw payload
                    # under the joined-ID key instead.
                    sequences[",".join(batch_ids)] = handle.read()

                handle.close()

                # Rate limiting between batches
                time.sleep(self.request_delay)

            except Exception as e:
                # Best effort: log the failure and continue with the next batch
                self.logger.error(f"バッチ処理エラー (batch {i//batch_size + 1}): {e}")
                continue

        return sequences

    def get_gene_summary(self, gene_symbol: str, organism: str = "human") -> Dict:
        """
        Retrieve comprehensive information about a gene.

        Args:
            gene_symbol: Gene symbol (e.g. BRCA1).
            organism: Organism name.

        Returns:
            Dict: Gene information, or {"error": ...} when the gene is not
            found or any Entrez call fails.
        """
        try:
            # Search the Gene database for the symbol
            gene_query = f"{gene_symbol}[Gene] AND {organism}[Organism]"
            gene_handle = Entrez.esearch(db="gene", term=gene_query, retmax=1)
            gene_results = Entrez.read(gene_handle)
            gene_handle.close()

            if not gene_results["IdList"]:
                return {"error": f"遺伝子 {gene_symbol} が見つかりません"}

            gene_id = gene_results["IdList"][0]

            # Fetch the summary record for the gene
            summary_handle = Entrez.esummary(db="gene", id=gene_id)
            summary = Entrez.read(summary_handle)[0]
            summary_handle.close()

            # Fetch nucleotide sequences linked to this gene
            link_handle = Entrez.elink(dbfrom="gene", db="nucleotide", id=gene_id)
            link_results = Entrez.read(link_handle)
            link_handle.close()

            nucleotide_ids = []
            if link_results[0]["LinkSetDb"]:
                nucleotide_ids = [link["Id"] for link in link_results[0]["LinkSetDb"][0]["Link"]]

            return {
                "gene_id": gene_id,
                "symbol": summary.get("Name", ""),
                "description": summary.get("Description", ""),
                "summary": summary.get("Summary", ""),
                "chromosome": summary.get("Chromosome", ""),
                "map_location": summary.get("MapLocation", ""),
                "gene_type": summary.get("GeneType", ""),
                "associated_sequences": len(nucleotide_ids),
                "nucleotide_ids": nucleotide_ids[:10]  # only the first 10
            }

        except Exception as e:
            self.logger.error(f"遺伝子情報取得エラー: {e}")
            return {"error": str(e)}

# Usage example
if __name__ == "__main__":
    # Initialise the client (use a real e-mail address and API key in practice)
    client = NCBIDataRetriever(
        email="your.email@example.com",
        api_key="your_api_key_here",  # optional
    )

    # 1. Basic information for the BRCA1 gene
    gene_info = client.get_gene_summary("BRCA1", "human")
    print("BRCA1遺伝子情報:")
    for label, key in (("説明", "description"),
                       ("染色体", "chromosome"),
                       ("関連配列数", "associated_sequences")):
        print(f"- {label}: {gene_info.get(key, 'N/A')}")

    # 2. Search for and fetch COVID-19 related sequences
    sars_ids = client.search_sequences(
        query="complete genome",
        database="nucleotide",
        max_results=50,
        filters={
            "organism": "SARS-CoV-2",
            "molecular_type": "genomic RNA",
            "date_range": "2020/01/01:2024/12/31",
        },
    )

    if sars_ids:
        # Fetch sequences in batches (only 10 entries for testing)
        fetched = client.fetch_sequences_batch(sars_ids[:10], batch_size=5)

        print(f"\n取得したCOVID-19配列: {len(fetched)}件")
        for seq_id, seq in list(fetched.items())[:3]:
            print(f"- {seq_id}: {len(seq)} bp")

SRA: シークエンシングデータの効率的アクセス

🧪 概念例(擬似コード)

import subprocess
import os
import pandas as pd
from pathlib import Path
import xml.etree.ElementTree as ET
import requests
from typing import List, Dict, Optional
import concurrent.futures
import hashlib

class SRADataManager:
    """Efficient management of SRA (Sequence Read Archive) data."""

    def __init__(self, work_dir: str = "./sra_data", max_workers: int = 4):
        """
        Args:
            work_dir: Working directory for downloaded data.
            max_workers: Maximum number of parallel downloads.
        """
        self.work_dir = Path(work_dir)
        # BUG FIX: parents=True so a nested work_dir (e.g. "out/sra") does not
        # raise FileNotFoundError when intermediate directories are missing.
        self.work_dir.mkdir(parents=True, exist_ok=True)
        self.max_workers = max_workers

        # Verify the SRA Toolkit is installed (non-fatal warning if missing)
        self._check_sra_toolkit()

    def _check_sra_toolkit(self):
        """Check whether the SRA Toolkit (fastq-dump) is available on PATH."""
        try:
            result = subprocess.run(["fastq-dump", "--version"],
                                  capture_output=True, text=True)
            if result.returncode == 0:
                print(f"SRA Toolkit確認済み: {result.stdout.strip()}")
            else:
                raise FileNotFoundError
        except FileNotFoundError:
            # Missing toolkit is reported but not fatal; downloads will fail later
            print("警告: SRA Toolkitが見つかりません")
            print("インストール方法: conda install -c bioconda sra-tools")

    def search_sra_studies(self, query: str, max_results: int = 100) -> pd.DataFrame:
        """
        Search SRA studies via the NCBI E-utilities.

        Args:
            query: Search query (e.g. "RNA-seq AND human AND cancer").
            max_results: Maximum number of hits.

        Returns:
            pd.DataFrame: Search results (empty frame on error / no hits).
        """
        try:
            # ESearch: find SRA study UIDs matching the query
            search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
            search_params = {
                "db": "sra",
                "term": query,
                "retmax": max_results,
                "retmode": "xml"
            }

            # timeout added so a stalled connection cannot hang forever
            response = requests.get(search_url, params=search_params, timeout=60)
            root = ET.fromstring(response.content)

            # Collect the SRA UID list
            sra_ids = [id_elem.text for id_elem in root.findall(".//Id")]

            if not sra_ids:
                print("検索結果なし")
                return pd.DataFrame()

            # ESummary: fetch per-study details for the IDs found above
            summary_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
            summary_params = {
                "db": "sra",
                "id": ",".join(sra_ids),
                "retmode": "xml"
            }

            response = requests.get(summary_url, params=summary_params, timeout=60)
            root = ET.fromstring(response.content)

            # Parse the DocSum records into flat dicts
            studies = []
            for doc_sum in root.findall(".//DocSum"):
                study_info = {"SRA_ID": doc_sum.find("Id").text}

                for item in doc_sum.findall(".//Item"):
                    name = item.get("Name")
                    if name in ["Title", "Platform", "Organism", "LibraryStrategy",
                               "LibrarySource", "SampleAccession", "StudyAccession"]:
                        study_info[name] = item.text or ""

                studies.append(study_info)

            df = pd.DataFrame(studies)
            print(f"検索結果: {len(df)}件のSRAスタディを取得")

            return df

        except Exception as e:
            # Best effort: report and return an empty frame so callers can continue
            print(f"SRA検索エラー: {e}")
            return pd.DataFrame()

    def get_run_info(self, study_accession: str) -> pd.DataFrame:
        """
        Fetch detailed information for the runs in a study.

        Args:
            study_accession: SRA study accession (e.g. SRP123456).

        Returns:
            pd.DataFrame: Run information (empty frame on error).
        """
        try:
            # Fetch the RunInfo CSV for the study
            url = f"https://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&db=sra&rettype=runinfo&term={study_accession}"

            response = requests.get(url, timeout=60)
            if response.status_code == 200:
                # Convert the CSV payload into a DataFrame
                from io import StringIO
                df = pd.read_csv(StringIO(response.text))

                # Keep only the columns of interest (when present)
                important_cols = [
                    "Run", "SampleName", "Experiment", "LibraryStrategy",
                    "LibrarySource", "Platform", "Instrument", "InsertSize",
                    "LibraryLayout", "spots", "bases", "download_path"
                ]

                available_cols = [col for col in important_cols if col in df.columns]
                df_filtered = df[available_cols]

                print(f"ラン情報取得: {len(df_filtered)}件")
                return df_filtered
            else:
                print(f"RunInfo取得失敗: HTTP {response.status_code}")
                return pd.DataFrame()

        except Exception as e:
            print(f"RunInfo取得エラー: {e}")
            return pd.DataFrame()

    def download_fastq(self, run_accession: str, output_dir: Optional[str] = None,
                      paired: bool = True, compressed: bool = True) -> Dict[str, str]:
        """
        Download FASTQ files for a run via fastq-dump.

        Args:
            run_accession: Run accession (e.g. SRR123456).
            output_dir: Output directory (defaults to work_dir/fastq).
            paired: True for paired-end data (splits into _1/_2 files).
            compressed: True to save gzip-compressed files.

        Returns:
            Dict[str, str]: Paths of the downloaded files, keyed "read_1"/
            "read_2" (paired) or "reads" (single); empty dict on failure.
        """
        if output_dir is None:
            output_dir = self.work_dir / "fastq"

        output_dir = Path(output_dir)
        # BUG FIX: parents=True, same reasoning as in __init__
        output_dir.mkdir(parents=True, exist_ok=True)

        try:
            # Build the fastq-dump command (list form, no shell)
            cmd = ["fastq-dump"]

            if paired:
                cmd.extend(["--split-files"])  # split paired-end reads into two files

            if compressed:
                cmd.extend(["--gzip"])  # save compressed output

            # Output directory
            cmd.extend(["--outdir", str(output_dir)])

            # Run accession goes last
            cmd.append(run_accession)

            print(f"FASTQダウンロード開始: {run_accession}")
            print(f"コマンド: {' '.join(cmd)}")

            # Execute
            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode == 0:
                # Verify which files actually landed on disk
                downloaded_files = {}
                suffix = ".fastq.gz" if compressed else ".fastq"

                if paired:
                    # Paired-end files: <run>_1 and <run>_2
                    for i in [1, 2]:
                        filename = f"{run_accession}_{i}{suffix}"
                        filepath = output_dir / filename
                        if filepath.exists():
                            downloaded_files[f"read_{i}"] = str(filepath)
                else:
                    # Single-end file
                    filename = f"{run_accession}{suffix}"
                    filepath = output_dir / filename
                    if filepath.exists():
                        downloaded_files["reads"] = str(filepath)

                print(f"ダウンロード完了: {len(downloaded_files)}ファイル")
                return downloaded_files
            else:
                print(f"ダウンロードエラー: {result.stderr}")
                return {}

        except Exception as e:
            print(f"FASTQダウンロードエラー: {e}")
            return {}

    def batch_download(self, run_list: List[str], max_concurrent: Optional[int] = None) -> Dict[str, Dict]:
        """
        Download multiple runs in parallel.

        Args:
            run_list: List of run accessions.
            max_concurrent: Maximum parallelism (defaults to self.max_workers).

        Returns:
            Dict[str, Dict]: Per-run download results; a failed run maps to
            {"error": <message>}.
        """
        if max_concurrent is None:
            max_concurrent = self.max_workers

        results = {}

        # Threads are appropriate here: fastq-dump is an external process,
        # so the GIL is not a bottleneck.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_concurrent) as executor:
            # Kick off all downloads
            future_to_run = {
                executor.submit(self.download_fastq, run): run
                for run in run_list
            }

            for future in concurrent.futures.as_completed(future_to_run):
                run = future_to_run[future]
                try:
                    result = future.result()
                    results[run] = result
                    print(f"完了: {run}")
                except Exception as e:
                    print(f"エラー {run}: {e}")
                    results[run] = {"error": str(e)}

        return results

    def verify_download_integrity(self, file_path: str, expected_md5: Optional[str] = None) -> bool:
        """
        Verify the integrity of a downloaded file.

        Args:
            file_path: Path of the file to check.
            expected_md5: Expected MD5 hex digest; when None, only existence
                is checked.

        Returns:
            bool: True when the file exists and (if given) the MD5 matches.
        """
        if not os.path.exists(file_path):
            return False

        if expected_md5:
            # Stream the file through MD5 in chunks to bound memory usage
            hasher = hashlib.md5()
            with open(file_path, 'rb') as f:
                for chunk in iter(lambda: f.read(4096), b""):
                    hasher.update(chunk)

            actual_md5 = hasher.hexdigest()
            return actual_md5 == expected_md5

        # No MD5 supplied: existence check only
        return True

# Usage example
if __name__ == "__main__":
    # Set up the SRA data manager
    manager = SRADataManager(work_dir="./sra_analysis", max_workers=2)

    # 1. Search for COVID-19 related RNA-seq studies
    studies = manager.search_sra_studies(
        query="COVID-19 AND RNA-seq AND human",
        max_results=20,
    )

    if not studies.empty:
        print("\n検索結果(最初の5件):")
        print(studies.head()[["StudyAccession", "Title", "Platform"]].to_string())

        # 2. Run-level details for the first study found
        study_acc = studies.iloc[0]["StudyAccession"]
        runs = manager.get_run_info(study_acc)

        if not runs.empty:
            print(f"\n{study_acc}のラン情報:")
            print(runs.head()[["Run", "LibraryStrategy", "spots", "bases"]].to_string())

            # 3. Download a small sample (for testing)
            sample_runs = runs.head(2)["Run"].tolist()
            print(f"\nテストダウンロード開始: {sample_runs}")

            # Download each run individually
            for run in sample_runs:
                files = manager.download_fastq(run, compressed=True)
                if files:
                    print(f"ダウンロード成功 {run}: {list(files.keys())}")
                else:
                    print(f"ダウンロード失敗: {run}")

I.2.2 UniProt データベースの高度な活用

🧪 概念例(擬似コード)

import requests
import pandas as pd
import json
from typing import List, Dict, Optional, Union
import time
import re
from urllib.parse import urlencode
import xml.etree.ElementTree as ET

class UniProtAnalyzer:
    """Comprehensive analysis tool for the UniProt protein database."""

    def __init__(self, rate_limit: float = 1.0):
        """
        Args:
            rate_limit: Wait time between requests, in seconds.
        """
        self.base_url = "https://rest.uniprot.org"
        self.rate_limit = rate_limit
        self.session = requests.Session()

        # Commonly used field sets for the UniProt search API
        self.common_fields = {
            "basic": [
                "accession", "id", "gene_names", "protein_name",
                "organism_name", "length", "mass"
            ],
            "sequence": [
                "accession", "sequence", "length", "mass",
                "cc_subcellular_location", "ft_domain"
            ],
            "function": [
                "accession", "protein_name", "cc_function",
                "go_c", "go_f", "go_p", "cc_pathway"
            ],
            "disease": [
                "accession", "gene_names", "cc_disease",
                "cc_involvement_in_disease", "cc_allergen", "cc_toxic_dose"
            ],
            "structure": [
                "accession", "ft_helix", "ft_strand", "ft_turn",
                "ft_disulfid", "xref_pdb", "cc_similarity"
            ]
        }

    def search_proteins(self, query: str, organism: Optional[str] = None,
                       reviewed: Optional[bool] = None, max_results: int = 100,
                       fields: Optional[List[str]] = None) -> pd.DataFrame:
        """
        Search for proteins.

        Args:
            query: Search query (gene name, protein name, keywords, ...).
            organism: Organism ("human", "mouse", or a taxonomy ID like "9606").
            reviewed: True to restrict the search to Swiss-Prot entries.
            max_results: Maximum number of hits.
            fields: Fields to retrieve (defaults to common_fields["basic"]).

        Returns:
            pd.DataFrame: Search results (empty frame on error).
        """
        # Build the query
        search_terms = [query]

        if organism:
            # A purely numeric value is treated as a taxonomy ID
            if organism.isdigit():
                search_terms.append(f"taxonomy_id:{organism}")
            else:
                search_terms.append(f"organism:{organism}")

        if reviewed is not None:
            search_terms.append("reviewed:true" if reviewed else "reviewed:false")

        final_query = " AND ".join(search_terms)

        # Field selection
        if fields is None:
            fields = self.common_fields["basic"]

        # API request parameters
        params = {
            "query": final_query,
            "format": "tsv",
            "fields": ",".join(fields),
            "size": min(max_results, 500)  # API limit
        }

        try:
            response = self.session.get(f"{self.base_url}/uniprotkb/search", params=params, timeout=60)
            response.raise_for_status()

            # Convert the TSV payload into a DataFrame
            from io import StringIO
            df = pd.read_csv(StringIO(response.text), sep='\t')

            print(f"検索結果: {len(df)}件のタンパク質")
            return df

        except requests.exceptions.RequestException as e:
            print(f"UniProt検索エラー: {e}")
            return pd.DataFrame()

        finally:
            time.sleep(self.rate_limit)

    def get_protein_details(self, accession: str,
                          include_features: bool = True,
                          include_interactions: bool = True) -> Dict:
        """
        Fetch detailed information for a single protein.

        Args:
            accession: UniProt accession.
            include_features: Include sequence features when True.
            include_interactions: Include interaction partners when True.

        Returns:
            Dict: Structured protein details (empty dict on error).
        """
        try:
            # Fetch the full JSON entry
            response = self.session.get(f"{self.base_url}/uniprotkb/{accession}", timeout=60)
            response.raise_for_status()

            protein_data = response.json()

            # Extract structured information via the helper methods
            details = {
                "accession": accession,
                "entry_name": protein_data.get("uniProtkbId", ""),
                "protein_names": self._extract_protein_names(protein_data),
                "gene_names": self._extract_gene_names(protein_data),
                "organism": self._extract_organism(protein_data),
                "sequence_info": self._extract_sequence_info(protein_data),
                "subcellular_location": self._extract_subcellular_location(protein_data),
                "function": self._extract_function(protein_data),
                "go_annotations": self._extract_go_annotations(protein_data),
                "pathways": self._extract_pathways(protein_data),
                "diseases": self._extract_diseases(protein_data)
            }

            # Optional extras
            if include_features:
                details["features"] = self._extract_features(protein_data)

            if include_interactions:
                details["interactions"] = self.get_protein_interactions(accession)

            return details

        except requests.exceptions.RequestException as e:
            print(f"タンパク質詳細取得エラー: {e}")
            return {}

        finally:
            time.sleep(self.rate_limit)

    def _extract_protein_names(self, data: Dict) -> Dict:
        """Extract protein names (recommended / alternative / short)."""
        names = {"recommended": "", "alternative": [], "short": []}

        if "proteinDescription" in data:
            desc = data["proteinDescription"]
            if "recommendedName" in desc:
                names["recommended"] = desc["recommendedName"].get("fullName", {}).get("value", "")

            if "alternativeNames" in desc:
                for alt in desc["alternativeNames"]:
                    if "fullName" in alt:
                        names["alternative"].append(alt["fullName"].get("value", ""))
                    if "shortNames" in alt:
                        names["short"].extend([sn.get("value", "") for sn in alt["shortNames"]])

        return names

    def _extract_gene_names(self, data: Dict) -> Dict:
        """Extract gene names (primary / synonyms / ordered-locus / ORF)."""
        genes = {"primary": "", "synonyms": [], "ordered_locus": [], "orf": []}

        if "genes" in data:
            for gene in data["genes"]:
                if gene.get("geneName"):
                    genes["primary"] = gene["geneName"].get("value", "")

                if "synonyms" in gene:
                    genes["synonyms"].extend([syn.get("value", "") for syn in gene["synonyms"]])

                if "orderedLocusNames" in gene:
                    genes["ordered_locus"].extend([oln.get("value", "") for oln in gene["orderedLocusNames"]])

                if "orfNames" in gene:
                    genes["orf"].extend([orf.get("value", "") for orf in gene["orfNames"]])

        return genes

    def _extract_organism(self, data: Dict) -> Dict:
        """Extract organism information."""
        organism = {"scientific_name": "", "common_name": "", "taxonomy_id": 0}

        if "organism" in data:
            org = data["organism"]
            organism["scientific_name"] = org.get("scientificName", "")
            organism["common_name"] = org.get("commonName", "")
            organism["taxonomy_id"] = org.get("taxId", 0)

        return organism

    def _extract_sequence_info(self, data: Dict) -> Dict:
        """Extract sequence information (length, mass, checksum, sequence)."""
        seq_info = {"length": 0, "mass": 0, "checksum": "", "sequence": ""}

        if "sequence" in data:
            seq = data["sequence"]
            seq_info["length"] = seq.get("length", 0)
            seq_info["mass"] = seq.get("molWeight", 0)
            seq_info["checksum"] = seq.get("crc64", "")
            seq_info["sequence"] = seq.get("value", "")

        return seq_info

    def _extract_subcellular_location(self, data: Dict) -> List[str]:
        """Extract subcellular locations from SUBCELLULAR LOCATION comments.

        NOTE(review): field layout based on the UniProt REST JSON schema
        (comments[].subcellularLocations[].location.value) — confirm against
        live entries. This helper was called but never defined in the original.
        """
        locations = []
        for comment in data.get("comments", []):
            if comment.get("commentType") == "SUBCELLULAR LOCATION":
                for loc in comment.get("subcellularLocations", []):
                    value = loc.get("location", {}).get("value", "")
                    if value:
                        locations.append(value)
        return locations

    def _extract_function(self, data: Dict) -> str:
        """Extract the first FUNCTION comment text (missing in the original)."""
        for comment in data.get("comments", []):
            if comment.get("commentType") == "FUNCTION":
                texts = comment.get("texts", [])
                if texts:
                    return texts[0].get("value", "")
        return ""

    def _extract_pathways(self, data: Dict) -> List[str]:
        """Extract PATHWAY comment texts (missing in the original)."""
        pathways = []
        for comment in data.get("comments", []):
            if comment.get("commentType") == "PATHWAY":
                for text in comment.get("texts", []):
                    value = text.get("value", "")
                    if value:
                        pathways.append(value)
        return pathways

    def _extract_diseases(self, data: Dict) -> List[Dict]:
        """Extract DISEASE comments (missing in the original)."""
        diseases = []
        for comment in data.get("comments", []):
            if comment.get("commentType") == "DISEASE":
                disease = comment.get("disease", {})
                diseases.append({
                    "name": disease.get("diseaseId", ""),
                    "description": disease.get("description", "")
                })
        return diseases

    def _extract_features(self, data: Dict) -> List[Dict]:
        """Extract sequence features (missing in the original)."""
        features = []
        for feature in data.get("features", []):
            location = feature.get("location", {})
            features.append({
                "type": feature.get("type", ""),
                "description": feature.get("description", ""),
                "start": location.get("start", {}).get("value"),
                "end": location.get("end", {}).get("value")
            })
        return features

    def _extract_go_annotations(self, data: Dict) -> Dict:
        """Extract Gene Ontology annotations, grouped by GO aspect.

        BUG FIX: the original stripped the aspect prefix ("F:"/"P:"/"C:")
        from the GoTerm value and then searched for that prefix in the
        stripped string, so no term was ever classified into any category.
        Here the prefix is read BEFORE stripping.
        """
        go_terms = {"molecular_function": [], "biological_process": [], "cellular_component": []}
        aspect_by_prefix = {
            "F": "molecular_function",
            "P": "biological_process",
            "C": "cellular_component",
        }

        for ref in data.get("uniProtKBCrossReferences", []):
            if ref.get("database") != "GO":
                continue

            go_id = ref.get("id", "")
            go_value = ""
            go_evidence = ""

            for prop in ref.get("properties", []):
                if prop.get("key") == "GoTerm":
                    go_value = prop.get("value", "")  # e.g. "F:ATP binding"
                elif prop.get("key") == "GoEvidenceType":
                    go_evidence = prop.get("value", "")

            # Split "F:ATP binding" into aspect prefix and term name
            prefix, sep, term_name = go_value.partition(":")
            if not sep:
                term_name = ""

            go_entry = {"id": go_id, "term": term_name, "evidence": go_evidence}

            if go_id.startswith("GO:") and prefix in aspect_by_prefix:
                go_terms[aspect_by_prefix[prefix]].append(go_entry)

        return go_terms

    def get_protein_interactions(self, accession: str, max_interactions: int = 50) -> List[Dict]:
        """
        Fetch protein-protein interactions from the STRING database.

        Args:
            accession: UniProt accession.
            max_interactions: Maximum number of interactions to return.

        Returns:
            List[Dict]: Interaction records (empty list on error).
        """
        try:
            # Query the STRING database API
            string_url = "https://string-db.org/api/json/network"
            params = {
                "identifiers": accession,
                "species": 9606,  # human
                "limit": max_interactions
            }

            response = self.session.get(string_url, params=params, timeout=60)
            response.raise_for_status()

            interactions = []
            for interaction in response.json():
                interactions.append({
                    "partner_a": interaction.get("preferredName_A", ""),
                    "partner_b": interaction.get("preferredName_B", ""),
                    "score": interaction.get("score", 0),
                    "interaction_type": "protein-protein"
                })

            return interactions

        except requests.exceptions.RequestException as e:
            print(f"相互作用情報取得エラー: {e}")
            return []

        finally:
            time.sleep(self.rate_limit)

    def analyze_protein_family(self, gene_family: str, organisms: Optional[List[str]] = None) -> pd.DataFrame:
        """
        Comparative analysis of a protein family across organisms.

        Args:
            gene_family: Gene family name (e.g. "histone", "kinase").
            organisms: Organisms to compare (default: human/mouse/rat/zebrafish).

        Returns:
            pd.DataFrame: Combined per-organism results (empty frame on no hits).
        """
        if organisms is None:
            organisms = ["human", "mouse", "rat", "zebrafish"]

        family_data = []

        for organism in organisms:
            print(f"{organism}の{gene_family}ファミリーを検索中...")

            # Search the family in this organism (Swiss-Prot only)
            results = self.search_proteins(
                query=gene_family,
                organism=organism,
                reviewed=True,
                max_results=100,
                fields=self.common_fields["basic"] + ["cc_function", "go_f"]
            )

            if not results.empty:
                results["organism"] = organism
                results["family"] = gene_family
                family_data.append(results)

            time.sleep(self.rate_limit)

        if family_data:
            combined_df = pd.concat(family_data, ignore_index=True)

            # Family statistics
            print(f"\n{gene_family}ファミリー解析結果:")
            print(f"- 総タンパク質数: {len(combined_df)}")
            print(f"- 生物種別分布:")
            print(combined_df["organism"].value_counts().to_string())

            return combined_df
        else:
            return pd.DataFrame()

    def functional_enrichment_analysis(self, protein_list: List[str]) -> Dict:
        """
        Functional enrichment analysis of a protein list.

        Args:
            protein_list: List of UniProt accessions (capped at 20 to limit
                API load).

        Returns:
            Dict: Top GO terms and pathways ranked by occurrence count.
        """
        # Fetch details for each protein (capped)
        proteins_data = []
        for accession in protein_list[:20]:
            details = self.get_protein_details(accession, include_features=False)
            if details:
                proteins_data.append(details)
            time.sleep(self.rate_limit)

        if not proteins_data:
            return {}

        # Tally GO terms per aspect
        go_counts = {"molecular_function": {}, "biological_process": {}, "cellular_component": {}}

        for protein in proteins_data:
            go_annotations = protein.get("go_annotations", {})
            for category, terms in go_annotations.items():
                for term in terms:
                    term_id = term.get("term", "Unknown")
                    go_counts[category][term_id] = go_counts[category].get(term_id, 0) + 1

        # Tally pathways
        pathway_counts = {}
        for protein in proteins_data:
            for pathway in protein.get("pathways", []):
                pathway_counts[pathway] = pathway_counts.get(pathway, 0) + 1

        # Rank by count, keeping the top 10 of each category
        enrichment_results = {
            "go_molecular_function": sorted(go_counts["molecular_function"].items(),
                                         key=lambda x: x[1], reverse=True)[:10],
            "go_biological_process": sorted(go_counts["biological_process"].items(),
                                          key=lambda x: x[1], reverse=True)[:10],
            "go_cellular_component": sorted(go_counts["cellular_component"].items(),
                                          key=lambda x: x[1], reverse=True)[:10],
            "pathways": sorted(pathway_counts.items(), key=lambda x: x[1], reverse=True)[:10],
            "total_proteins": len(proteins_data)
        }

        return enrichment_results

# Usage example
if __name__ == "__main__":
    # Set up the UniProt analysis client
    uniprot = UniProtAnalyzer(rate_limit=1.0)

    # 1. Detailed analysis of the BRCA1 protein
    print("=== BRCA1タンパク質詳細解析 ===")
    brca1 = uniprot.get_protein_details(
        "P38398",
        include_features=True,
        include_interactions=True,
    )

    if brca1:
        seq_info = brca1["sequence_info"]
        print(f"タンパク質名: {brca1['protein_names']['recommended']}")
        print(f"遺伝子名: {brca1['gene_names']['primary']}")
        print(f"配列長: {seq_info['length']} aa")
        print(f"分子量: {seq_info['mass']} Da")
        print(f"相互作用数: {len(brca1.get('interactions', []))}")

    # 2. Cross-species comparison of the kinase family
    print("\n=== キナーゼファミリー比較解析 ===")
    kinases = uniprot.analyze_protein_family(
        gene_family="protein kinase",
        organisms=["human", "mouse"],
    )

    if not kinases.empty:
        print("\n上位10タンパク質:")
        print(kinases.head(10)[["Entry", "Gene Names", "Protein names", "organism"]].to_string())

    # 3. Functional enrichment of cancer-related proteins
    print("\n=== がん関連タンパク質の機能的濃縮解析 ===")
    cancer_proteins = ["P53_HUMAN", "P38398", "P04637", "P21359", "Q02952"]  # example IDs

    # Resolve accessions via a search rather than the hard-coded examples
    hits = uniprot.search_proteins(
        "cancer AND tumor suppressor",
        organism="human",
        reviewed=True,
        max_results=20,
    )

    if not hits.empty:
        accessions = hits["Entry"].tolist()[:10]
        enrichment = uniprot.functional_enrichment_analysis(accessions)

        print(f"解析対象タンパク質数: {enrichment.get('total_proteins', 0)}")
        print("\n上位GO Molecular Function:")
        for term, count in enrichment.get("go_molecular_function", [])[:5]:
            print(f"  {term}: {count}件")

        print("\n上位GO Biological Process:")
        for term, count in enrichment.get("go_biological_process", [])[:5]:
            print(f"  {term}: {count}件")