Spaces:

gaaahee
/

news-stance-detection

Sleeping

ZedwrKc committed 14 days ago

Commit

bb78fbc

1 Parent(s): 2b2a035

Add stance analysis integration

- Integrate KoBERT stance classifier (gaaahee/political-news-stance-classifier)
- Update /batch-process-articles to include stance analysis
- Add huggingface-hub dependency
- Support 3-in-1 pipeline: Summary + Embedding + Stance

Files changed (4) hide show

requirements.txt +1 -0
src/api/main.py +47 -7
src/models/stance_classifier.py +250 -0
test_stance_integration.py +173 -0

requirements.txt CHANGED Viewed

@@ -7,6 +7,7 @@ sentencepiece==0.2.0
 sentence-transformers==3.3.1
 python-dotenv==1.0.1
 keybert==0.8.5
 # BERTopic Clustering & Visualization
 bertopic==0.17.3

 sentence-transformers==3.3.1
 python-dotenv==1.0.1
 keybert==0.8.5
+huggingface-hub>=0.20.0
 # BERTopic Clustering & Visualization
 bertopic==0.17.3

src/api/main.py CHANGED Viewed

@@ -21,6 +21,7 @@ from src.api.schemas import (
 )
 from src.models.summarizer import KoBARTSummarizer
 from src.models.embedding import KoSentenceEmbedder
 from src.utils.config import settings
 from src.utils.logger import setup_logger
 from src.utils.validation import validate_models_loaded, validate_batch_size
@@ -31,6 +32,7 @@ logger = setup_logger()
 # Global model instances
 summarizer: KoBARTSummarizer = None
 embedder: KoSentenceEmbedder = None
 @asynccontextmanager
@@ -39,12 +41,13 @@ async def lifespan(app: FastAPI):
     Application lifespan context manager
     Load models on startup, cleanup on shutdown
     """
-    global summarizer, embedder
     # Startup: Load models
     logger.info("Starting AI Processing Service...")
     logger.info(f"Summarization Model: {settings.MODEL_NAME}")
     logger.info(f"Embedding Model: jhgan/ko-sroberta-multitask")
     logger.info(f"Max batch size: {settings.MAX_BATCH_SIZE}")
     try:
@@ -56,6 +59,12 @@ async def lifespan(app: FastAPI):
         embedder = KoSentenceEmbedder()
         logger.info("✓ Embedding model loaded successfully (768-dim)")
         logger.info("All models ready!")
     except Exception as e:
         logger.error(f"Failed to load models: {e}")
@@ -93,7 +102,7 @@ async def root():
         "version": "1.0.0",
         "endpoints": {
             "health": "/health",
-            "process": "/batch-process-articles (summary + embedding)",
             "summarize": "/batch-summarize (legacy)",
             "cluster": "/cluster-topics (BERTopic clustering - CustomTokenizer)",
             "cluster_mecab": "/cluster-topics-mecab (BERTopic clustering - Mecab)",
@@ -117,7 +126,7 @@ async def health_check():
         status="healthy",
         summarization_model=summarizer.model_name,
         embedding_model=embedder.model_name,
-        stance_model=None,  # Not yet implemented
         device=summarizer.device
     )
@@ -199,13 +208,13 @@ async def batch_process_articles(request: BatchProcessRequest):
     Processing Pipeline:
     1. Content → Summary (KoBART)
     2. Title + Summary → Embedding (ko-sroberta-multitask, 768-dim) ⭐
-    3. Summary → Stance (optional, not yet implemented)
     Args:
         request: BatchProcessRequest with list of articles
     Returns:
-        BatchProcessResponse with summaries, embeddings, and optional stance results
     Raises:
         HTTPException: If models not loaded or batch size exceeded
@@ -301,8 +310,39 @@ async def batch_process_articles(request: BatchProcessRequest):
                 logger.error(f"Batch embedding failed: {e}")
                 # Embeddings will remain None for failed articles
-        # Step 3: Stance analysis (TODO: implement when model ready)
-        # For now, stance remains None
         # Calculate statistics
         successful = sum(1 for r in results if r.error is None)

 )
 from src.models.summarizer import KoBARTSummarizer
 from src.models.embedding import KoSentenceEmbedder
+from src.models.stance_classifier import KoBERTStanceAnalyzer
 from src.utils.config import settings
 from src.utils.logger import setup_logger
 from src.utils.validation import validate_models_loaded, validate_batch_size
 # Global model instances
 summarizer: KoBARTSummarizer = None
 embedder: KoSentenceEmbedder = None
+stance_analyzer: KoBERTStanceAnalyzer = None
 @asynccontextmanager
     Application lifespan context manager
     Load models on startup, cleanup on shutdown
     """
+    global summarizer, embedder, stance_analyzer
     # Startup: Load models
     logger.info("Starting AI Processing Service...")
     logger.info(f"Summarization Model: {settings.MODEL_NAME}")
     logger.info(f"Embedding Model: jhgan/ko-sroberta-multitask")
+    logger.info(f"Stance Model: gaaahee/political-news-stance-classifier")
     logger.info(f"Max batch size: {settings.MAX_BATCH_SIZE}")
     try:
         embedder = KoSentenceEmbedder()
         logger.info("✓ Embedding model loaded successfully (768-dim)")
+        # Load stance analysis model
+        stance_analyzer = KoBERTStanceAnalyzer(
+            repo_id="gaaahee/political-news-stance-classifier"
+        )
+        logger.info("✓ Stance analysis model loaded successfully")
         logger.info("All models ready!")
     except Exception as e:
         logger.error(f"Failed to load models: {e}")
         "version": "1.0.0",
         "endpoints": {
             "health": "/health",
+            "process": "/batch-process-articles (summary + embedding + stance)",
             "summarize": "/batch-summarize (legacy)",
             "cluster": "/cluster-topics (BERTopic clustering - CustomTokenizer)",
             "cluster_mecab": "/cluster-topics-mecab (BERTopic clustering - Mecab)",
         status="healthy",
         summarization_model=summarizer.model_name,
         embedding_model=embedder.model_name,
+        stance_model=stance_analyzer.model_name if stance_analyzer else None,
         device=summarizer.device
     )
     Processing Pipeline:
     1. Content → Summary (KoBART)
     2. Title + Summary → Embedding (ko-sroberta-multitask, 768-dim) ⭐
+    3. Summary → Stance (KoBERT fine-tuned, support/neutral/oppose) ⭐
     Args:
         request: BatchProcessRequest with list of articles
     Returns:
+        BatchProcessResponse with summaries, embeddings, and stance results
     Raises:
         HTTPException: If models not loaded or batch size exceeded
                 logger.error(f"Batch embedding failed: {e}")
                 # Embeddings will remain None for failed articles
+        # Step 3: Stance analysis (KoBERT fine-tuned model)
+        if stance_analyzer:
+            try:
+                # Collect summaries for stance analysis (only successful summaries)
+                summaries_for_stance = []
+                stance_indices = []
+                for idx, result in enumerate(results):
+                    if result.summary and result.error is None:
+                        summaries_for_stance.append(result.summary)
+                        stance_indices.append(idx)
+                if summaries_for_stance:
+                    logger.info(f"Analyzing stance for {len(summaries_for_stance)} summaries")
+                    # Batch stance analysis
+                    stance_results = stance_analyzer.analyze_batch(
+                        summaries_for_stance,
+                        batch_size=16
+                    )
+                    # Map stance results back to results
+                    for idx, stance_result in zip(stance_indices, stance_results):
+                        from src.api.schemas import StanceResult
+                        results[idx].stance = StanceResult(**stance_result)
+                    logger.info(f"✓ Stance analysis completed for {len(stance_results)} articles")
+            except Exception as e:
+                logger.error(f"Stance analysis failed: {e}")
+                # Stance will remain None for failed articles
+        else:
+            logger.warning("Stance analyzer not available, skipping stance analysis")
         # Calculate statistics
         successful = sum(1 for r in results if r.error is None)

src/models/stance_classifier.py ADDED Viewed

	@@ -0,0 +1,250 @@

+"""
+KoBERT-based Stance Classifier for Korean Political News
+Loads fine-tuned stance classification model from HuggingFace Hub.
+Model classifies news text into 3 stances: support/neutral/oppose
+"""
+import torch
+import torch.nn as nn
+from transformers import BertModel, AutoTokenizer
+from huggingface_hub import hf_hub_download
+import logging
+from typing import List, Dict, Optional
+import json
+logger = logging.getLogger(__name__)
+class StanceClassifier(nn.Module):
+    """
+    Stance classification network built on a pretrained BERT encoder
+    The encoder's pooled output is passed through dropout and a single
+    linear layer that emits one logit per stance class.
+    Args:
+        n_classes: Number of output classes (size of the logit vector)
+        dropout: Dropout probability applied to the pooled output
+        model_name: Name of the pretrained encoder given to BertModel.from_pretrained
+    """
+    def __init__(self, n_classes=3, dropout=0.3, model_name="skt/kobert-base-v1"):
+        super().__init__()
+        self.bert = BertModel.from_pretrained(model_name)
+        self.dropout = nn.Dropout(dropout)
+        # The head's input width follows the encoder's configured hidden size
+        hidden_size = self.bert.config.hidden_size
+        self.classifier = nn.Linear(hidden_size, n_classes)
+    def forward(self, input_ids, attention_mask):
+        """Return the classification logits for the given token ids and attention mask"""
+        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+        return self.classifier(self.dropout(encoded.pooler_output))
+class KoBERTStanceAnalyzer:
+    """
+    KoBERT-based stance analyzer for Korean political news
+    Loads model from HuggingFace Hub (gaaahee/political-news-stance-classifier)
+    and performs stance classification on article summaries.
+    The predict_* methods return the raw prediction (label, confidence and
+    per-class probabilities); the analyze_* methods reshape that prediction
+    into the dict layout documented as compatible with the StanceResult schema.
+    """
+    def __init__(
+        self,
+        repo_id: str = "gaaahee/political-news-stance-classifier",
+        device: Optional[str] = None
+    ):
+        """
+        Initialize stance analyzer
+        Args:
+            repo_id: HuggingFace Hub repository ID
+            device: Device to run model on (cpu/cuda). Auto-detects if None.
+        Raises:
+            Exception: Whatever _load_model raises when the download or load fails
+        """
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.repo_id = repo_id
+        # Index i of both lists names class i of the model output
+        self.label_names = ["옹호", "중립", "비판"]
+        self.label_names_en = ["support", "neutral", "oppose"]
+        # Model metadata defaults (overwritten from config.json in _load_model)
+        self.model_name = "KoBERT Stance Classifier"
+        self.base_model = "skt/kobert-base-v1"
+        self.tokenizer_name = "monologg/kobert"
+        self.num_labels = 3
+        self.max_length = 512
+        self.dropout = 0.3
+        # Load model components from HF Hub
+        self._load_model()
+        logger.info(f"✓ Stance model loaded from {repo_id} on {self.device}")
+    def _load_model(self):
+        """
+        Load tokenizer and model from HuggingFace Hub
+        Downloads config.json and model.pth from self.repo_id, rebuilds the
+        StanceClassifier architecture, loads the fine-tuned weights into it,
+        moves it to self.device and switches it to eval mode.
+        Raises:
+            ValueError: If the config's num_labels does not match the label names
+            Exception: Any download/load error is logged and re-raised unchanged
+        """
+        try:
+            # Load config from HF Hub
+            logger.info(f"Downloading config from {self.repo_id}")
+            config_path = hf_hub_download(self.repo_id, "config.json")
+            with open(config_path, "r", encoding="utf-8") as f:
+                config = json.load(f)
+            # Update model metadata from config
+            self.base_model = config.get("base_model", "skt/kobert-base-v1")
+            self.tokenizer_name = config.get("tokenizer", "monologg/kobert")
+            self.num_labels = config.get("num_labels", 3)
+            self.max_length = config.get("max_length", 512)
+            self.dropout = config.get("dropout", 0.3)
+            # The label lists are fixed, so a different class count would
+            # mislabel or crash at prediction time; fail early instead
+            if self.num_labels != len(self.label_names_en):
+                raise ValueError(
+                    f"config num_labels={self.num_labels} does not match the "
+                    f"{len(self.label_names_en)} known stance labels"
+                )
+            # Load tokenizer (must use monologg/kobert for compatibility)
+            # NOTE(security): trust_remote_code=True executes code shipped with
+            # the tokenizer repository; only use trusted tokenizer names
+            logger.info(f"Loading tokenizer: {self.tokenizer_name}")
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                self.tokenizer_name,
+                trust_remote_code=True
+            )
+            # Initialize model architecture
+            logger.info(f"Initializing model architecture: {self.base_model}")
+            self.model = StanceClassifier(
+                n_classes=self.num_labels,
+                dropout=self.dropout,
+                model_name=self.base_model
+            )
+            # Download and load fine-tuned weights from HF Hub
+            # NOTE(security): torch.load unpickles the checkpoint, so repo_id
+            # must be trusted; consider weights_only=True if the installed
+            # torch version supports it - TODO confirm
+            logger.info(f"Downloading model weights from {self.repo_id}")
+            model_path = hf_hub_download(self.repo_id, "model.pth")
+            state_dict = torch.load(model_path, map_location=self.device)
+            self.model.load_state_dict(state_dict)
+            # Move to device and set to eval mode
+            self.model.to(self.device)
+            self.model.eval()
+            logger.info(f"✓ Model loaded successfully (Test Accuracy: {config.get('test_accuracy', 'N/A')})")
+        except Exception as e:
+            logger.error(f"Failed to load stance model from HF Hub: {e}")
+            raise
+    def _format_prediction(self, probs: List[float], pred: int) -> Dict:
+        """
+        Build the prediction dict for one text
+        Args:
+            probs: Per-class probabilities, ordered like self.label_names_en
+            pred: Index of the predicted class
+        Returns:
+            Dict with stance, stance_kr, confidence and probabilities
+        """
+        return {
+            "stance": self.label_names_en[pred],
+            "stance_kr": self.label_names[pred],
+            "confidence": round(probs[pred], 4),
+            "probabilities": {
+                name: round(prob, 4)
+                for name, prob in zip(self.label_names_en, probs)
+            }
+        }
+    @staticmethod
+    def _to_stance_result(prediction: Dict) -> Dict:
+        """
+        Convert a prediction dict into the StanceResult-compatible layout
+        Args:
+            prediction: Dict as produced by predict_single / predict_batch
+        Returns:
+            Dict with stance_label, prob_positive, prob_neutral,
+            prob_negative and stance_score
+        """
+        probs = prediction["probabilities"]
+        return {
+            "stance_label": prediction["stance"],
+            "prob_positive": probs["support"],
+            "prob_neutral": probs["neutral"],
+            "prob_negative": probs["oppose"],
+            # Rounded like its inputs so float subtraction noise
+            # (e.g. 0.20989999999999998) does not leak into the result
+            "stance_score": round(probs["support"] - probs["oppose"], 4)
+        }
+    def predict_single(self, text: str) -> Dict:
+        """
+        Predict stance for a single text
+        Args:
+            text: Article summary to analyze
+        Returns:
+            Dict with stance, confidence, and probabilities
+        """
+        # A one-element batch runs through the same tokenize/forward/format path
+        return self.predict_batch([text], batch_size=1)[0]
+    def predict_batch(self, texts: List[str], batch_size: int = 16) -> List[Dict]:
+        """
+        Predict stance for multiple texts in batches
+        Args:
+            texts: List of article summaries to analyze
+            batch_size: Batch size for processing (must be >= 1)
+        Returns:
+            List of stance prediction results, one per input text, in order
+        Raises:
+            ValueError: If batch_size is smaller than 1
+        """
+        if batch_size < 1:
+            # A negative step would make range() empty and silently drop every text
+            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
+        results = []
+        for start in range(0, len(texts), batch_size):
+            batch = texts[start:start + batch_size]
+            # padding="max_length" pads every text to self.max_length tokens
+            inputs = self.tokenizer(
+                batch,
+                return_tensors="pt",
+                max_length=self.max_length,
+                truncation=True,
+                padding="max_length"
+            )
+            input_ids = inputs["input_ids"].to(self.device)
+            attention_mask = inputs["attention_mask"].to(self.device)
+            with torch.no_grad():
+                outputs = self.model(input_ids, attention_mask)
+                probs = torch.softmax(outputs, dim=1)
+            # Convert once per batch instead of calling .item() per value
+            pred_indices = torch.argmax(probs, dim=1).tolist()
+            prob_rows = probs.tolist()
+            results.extend(
+                self._format_prediction(row, pred)
+                for row, pred in zip(prob_rows, pred_indices)
+            )
+        return results
+    def analyze_stance(self, summary: str) -> Dict:
+        """
+        Analyze stance from article summary
+        Args:
+            summary: Article summary text
+        Returns:
+            Dict compatible with StanceResult schema:
+            {
+                "stance_label": "support" | "neutral" | "oppose",
+                "prob_positive": float,  # support probability
+                "prob_neutral": float,
+                "prob_negative": float,  # oppose probability
+                "stance_score": float    # prob_positive - prob_negative, rounded to 4 decimals
+            }
+        """
+        return self._to_stance_result(self.predict_single(summary))
+    def analyze_batch(self, summaries: List[str], batch_size: int = 16) -> List[Dict]:
+        """
+        Analyze stance for multiple summaries
+        Args:
+            summaries: List of article summaries
+            batch_size: Batch size for processing (must be >= 1)
+        Returns:
+            List of stance results compatible with StanceResult schema
+        Raises:
+            ValueError: If batch_size is smaller than 1
+        """
+        predictions = self.predict_batch(summaries, batch_size=batch_size)
+        return [self._to_stance_result(prediction) for prediction in predictions]

test_stance_integration.py ADDED Viewed

	@@ -0,0 +1,173 @@

+"""
+Test script for stance analysis integration
+Tests the complete pipeline:
+1. Summary generation
+2. Embedding generation
+3. Stance analysis
+"""
+import requests
+import json
+def _print_article_result(index, article, article_result):
+    """Print the original text, summary, embedding and stance sections for one article"""
+    print("=" * 80)
+    print(f"ARTICLE {index}: {article['title']}")
+    print("=" * 80)
+    # Original content
+    print("\nOriginal (first 100 chars):")
+    print(f"  {article['content'][:100]}...")
+    # Summary (.get: an omitted field is treated like a null one)
+    summary = article_result.get('summary')
+    if summary:
+        print("\nSummary:")
+        print(f"  {summary}")
+    else:
+        print(f"\nSummary: FAILED - {article_result.get('error')}")
+    # Embedding
+    embedding = article_result.get('embedding')
+    if embedding:
+        print("\nEmbedding:")
+        print(f"  Dimension: {len(embedding)}")
+        print(f"  First 5 values: {embedding[:5]}")
+    else:
+        print("\nEmbedding: Not generated")
+    # Stance
+    stance = article_result.get('stance')
+    if stance:
+        print("\nStance Analysis:")
+        print(f"  Label: {stance['stance_label'].upper()}")
+        print(f"  Score: {stance['stance_score']:.4f}")
+        print("  Probabilities:")
+        print(f"    - Support (옹호): {stance['prob_positive']:.4f}")
+        print(f"    - Neutral (중립): {stance['prob_neutral']:.4f}")
+        print(f"    - Oppose (비판):  {stance['prob_negative']:.4f}")
+    else:
+        print("\nStance: Not analyzed")
+    print()
+def test_batch_process_with_stance():
+    """
+    Test /batch-process-articles endpoint with stance analysis
+    Posts three sample Korean political news articles to the local API and
+    prints the response statistics followed by each article's summary,
+    embedding and stance. Errors are printed rather than raised.
+    """
+    # Test data: Korean political news articles
+    test_articles = [
+        {
+            "article_id": 1,
+            "title": "정부 부동산 규제 완화",
+            "content": "정부가 오늘 부동산 규제 완화 방안을 발표했다. 이번 조치로 주택 구매가 더 쉬워질 전망이다. "
+                      "전문가들은 이번 정책이 경제 활성화에 도움이 될 것으로 기대하고 있다."
+        },
+        {
+            "article_id": 2,
+            "title": "야당 정부 정책 비판",
+            "content": "야당은 오늘 정부의 정책에 대해 강하게 비판했다. 야당 대표는 이번 정책이 서민들에게 "
+                      "도움이 되지 않는다고 주장했다. 야당은 정부가 재검토해야 한다고 촉구했다."
+        },
+        {
+            "article_id": 3,
+            "title": "국회 법안 심의",
+            "content": "국회에서 오늘 법안 심의가 진행되었다. 여야 의원들이 참석한 가운데 다양한 의견이 "
+                      "제시되었다. 법안은 다음 주 본회의에 상정될 예정이다."
+        }
+    ]
+    # API endpoint
+    url = "http://localhost:7860/batch-process-articles"
+    # Request payload
+    payload = {
+        "articles": test_articles,
+        "max_summary_length": 300,
+        "min_summary_length": 150
+    }
+    print("Testing /batch-process-articles endpoint...")
+    print(f"Sending {len(test_articles)} articles\n")
+    try:
+        # Send request
+        response = requests.post(url, json=payload, timeout=120)
+        response.raise_for_status()
+        # Parse response
+        result = response.json()
+        print("=" * 80)
+        print("RESPONSE SUMMARY")
+        print("=" * 80)
+        print(f"Total processed: {result['total_processed']}")
+        print(f"Successful: {result['successful']}")
+        print(f"Failed: {result['failed']}")
+        print(f"Processing time: {result['processing_time_seconds']:.2f}s")
+        print()
+        # Display results; zip pairs each result with the article sent at the
+        # same position and cannot index past the end of test_articles
+        paired = zip(test_articles, result['results'])
+        for i, (article, article_result) in enumerate(paired, 1):
+            _print_article_result(i, article, article_result)
+        print("=" * 80)
+        print("TEST COMPLETED SUCCESSFULLY")
+        print("=" * 80)
+    except requests.exceptions.RequestException as e:
+        print("ERROR: Request failed")
+        print(f"  {e}")
+        # e.response is None when no HTTP response was received
+        if e.response is not None:
+            print(f"  Response: {e.response.text}")
+    except Exception as e:
+        print(f"ERROR: {e}")
+def test_health_check():
+    """
+    Query the /health endpoint and report which models are loaded
+    Returns:
+        True when the request succeeds and a stance model is reported,
+        False when the stance model is None or any error occurs
+    """
+    health_url = "http://localhost:7860/health"
+    rule = "=" * 80
+    print("Testing /health endpoint...")
+    try:
+        reply = requests.get(health_url, timeout=10)
+        reply.raise_for_status()
+        health = reply.json()
+        print("\n" + rule)
+        print("HEALTH CHECK")
+        print(rule)
+        print(f"Status: {health['status']}")
+        print(f"Device: {health['device']}")
+        print("\nModels loaded:")
+        print(f"  - Summarization: {health['summarization_model']}")
+        print(f"  - Embedding: {health['embedding_model']}")
+        print(f"  - Stance: {health['stance_model']}")
+        print(rule)
+        print()
+        # The integration is only considered healthy with a stance model present
+        if health['stance_model'] is not None:
+            return True
+        print("WARNING: Stance model not loaded!")
+        return False
+    except Exception as e:
+        print(f"ERROR: Health check failed - {e}")
+        return False
+if __name__ == "__main__":
+    # Script entry point: banner, then health check, then the batch test
+    print("\n" + "=" * 80)
+    print("STANCE ANALYSIS INTEGRATION TEST")
+    print("=" * 80)
+    print()
+    # Step 1: Health check gates the batch test
+    if not test_health_check():
+        print("\n✗ Health check failed - skipping batch test")
+        print("\nMake sure the API server is running:")
+        print("  python app.py")
+    else:
+        print("\n✓ Health check passed\n")
+        # Step 2: Test batch processing with stance
+        test_batch_process_with_stance()