maslionok committed on
Commit
eff2c39
·
1 Parent(s): 7c3940e

added some explanations to the output

Browse files
Files changed (1) hide show
  1. app.py +120 -8
app.py CHANGED
@@ -8,6 +8,11 @@ st.set_page_config(page_title="Impresso Ad Classifier", layout="centered")
8
 
9
  st.title("📰 Impresso Ad Classifier")
10
  st.markdown("Enter text below to classify it as an Advertisement or Non-Advertisement using the `ad_model_pipeline`.")
 
 
 
 
 
11
 
12
  # --- LOAD PIPELINE ---
13
  @st.cache_resource
@@ -23,7 +28,15 @@ except Exception as e:
23
  st.stop()
24
 
25
  # --- USER INTERFACE ---
26
- text_input = st.text_area("Input Text", height=200, placeholder="Paste historical text here...")
 
 
 
 
 
 
 
 
27
 
28
  if st.button("Classify", type="primary"):
29
  if text_input.strip():
@@ -45,16 +58,115 @@ if st.button("Classify", type="primary"):
45
  # 1. Visual Header
46
  result_type = main_result.get('type', 'unknown')
47
  if result_type == 'ad':
48
- st.success(f"### Result: ADVERTISEMENT")
49
  else:
50
- st.info(f"### Result: NON-ADVERTISEMENT")
51
 
52
- # 2. Diagnostics Data
53
- st.subheader("Full Diagnostics")
54
- st.markdown("Below are the classification details, probabilities, and rule scores:")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
- # Display as pretty JSON
57
- st.json(main_result)
 
58
 
59
  except Exception as e:
60
  st.error(f"Error during processing: {e}")
 
8
 
9
  st.title("📰 Impresso Ad Classifier")
10
  st.markdown("Enter text below to classify it as an Advertisement or Non-Advertisement using the `ad_model_pipeline`.")
11
+ st.info("""
12
+ **Now supports German and French!**
13
+
14
+ You can classify texts in English, German, or French. The classifier automatically adapts its thresholds and rules for these languages.
15
+ """)
16
 
17
  # --- LOAD PIPELINE ---
18
  @st.cache_resource
 
28
  st.stop()
29
 
30
  # --- USER INTERFACE ---
31
+ EXAMPLE_TEXT = (
32
+ "Nouveaux exploits des pilotes suisses Le record suisse de vol avee but fixé et retour au point de départ pour planeurs biplaces, détenu par Walter Meierhofer et Rosemarie Meierhofer avec 220 kilomètres, a été battu deux fois lundi. Partant de Diillikon, l'équipe HuberLiischer a en effet réussi un vol jusqn'aux Ponts-de-Martel et retour, soit nne distance de 278 km., tandis que Schàrli-Hodel atteignaient 261 km. qu'à La Chaux-de-Fonds et retour. D'autre part, à Birrfeld, Fritz _Dubg s. obtenu une distinction internationale ( Insigne or avec diamant)'pour avoir réalisé un gain d'altitude de 4000 mètres."
33
+ )
34
+ text_input = st.text_area(
35
+ "Input Text",
36
+ value=EXAMPLE_TEXT,
37
+ height=200,
38
+ placeholder="Paste historical text here..."
39
+ )
40
 
41
  if st.button("Classify", type="primary"):
42
  if text_input.strip():
 
58
  # 1. Visual Header
59
  result_type = main_result.get('type', 'unknown')
60
  if result_type == 'ad':
61
+ st.success(f"### Result: ADVERTISEMENT")
62
  else:
63
+ st.info(f"### ℹ️ Result: NON-ADVERTISEMENT")
64
 
65
+ # 2. Key Metrics
66
+ st.subheader("📊 Classification Metrics")
67
+
68
+ col1, col2 = st.columns(2)
69
+ with col1:
70
+ st.metric(
71
+ "Final Probability",
72
+ f"{main_result.get('promotion_prob_final', 0):.2f}",
73
+ help="The final probability that this text is an advertisement (after all adjustments)"
74
+ )
75
+ st.metric(
76
+ "Model Confidence",
77
+ f"{main_result.get('model_confidence', 0):.2f}",
78
+ help="How confident the AI model is in its prediction (0=uncertain, 1=very confident)"
79
+ )
80
+
81
+ with col2:
82
+ st.metric(
83
+ "Decision Threshold",
84
+ f"{main_result.get('threshold_used', 0):.2f}",
85
+ help="The threshold used for this text. Probability above this = ad classification"
86
+ )
87
+ st.metric(
88
+ "Rule Confidence",
89
+ f"{main_result.get('rule_confidence', 0):.2f}",
90
+ help="Confidence based on detected ad indicators (phone, price, etc.)"
91
+ )
92
+
93
+ # 3. Probability Breakdown
94
+ st.subheader("🔍 Probability Breakdown")
95
+
96
+ with st.expander("**How the final decision was made**", expanded=True):
97
+ st.markdown(f"""
98
+ **Initial Model Prediction:** `{main_result.get('promotion_prob', 0):.3f}`
99
+ The raw probability from the XLM-RoBERTa model that this text belongs to the 'Promotion' category.
100
+
101
+ **Ensemble Ad Signal:** `{main_result.get('ensemble_ad_signal', 0):.3f}`
102
+ Combined probability from ad-like categories (Promotion, Obituary, Call for participation) weighted at 70%,
103
+ plus inverse of non-ad categories (News, Opinion, Article, Report) weighted at 30%.
104
+
105
+ **Final Probability:** `{main_result.get('promotion_prob_final', 0):.3f}`
106
+ Blended score: 85% initial model prediction + 15% ensemble signal, with rule-based adjustments applied.
107
+
108
+ **Decision:** {'✅ **AD**' if result_type == 'ad' else '❌ **NON-AD**'} (final probability {'≥' if result_type == 'ad' else '<'} threshold of {main_result.get('threshold_used', 0):.3f})
109
+ """)
110
+
111
+ # 4. Model Details
112
+ st.subheader("🤖 Model Analysis")
113
+
114
+ with st.expander("**Cross-genre classification details**"):
115
+ st.markdown(f"""
116
+ The model classifies text across multiple newspaper genres. Here's what it detected:
117
+
118
+ - **Top Predicted Genre:** `{main_result.get('xgenre_top_label', 'unknown')}`
119
+ - **Confidence in Top Genre:** `{main_result.get('xgenre_top_prob', 0):.3f}`
120
+
121
+ The model uses multiple genre signals to determine if content is promotional in nature.
122
+ Ad-like genres include: Promotion, Obituary, and Call for participation.
123
+ """)
124
+
125
+ # 5. Rule-based Features
126
+ st.subheader("📋 Rule-Based Indicators")
127
+
128
+ with st.expander("**Detected advertisement patterns**"):
129
+ st.markdown(f"""
130
+ **Rule Score:** `{main_result.get('rule_score', 0):.2f}` / 10.0
131
+ This score is calculated from detected patterns common in advertisements:
132
+
133
+ - **Price mentions** (CHF, Fr., €, $) — weight: 2.0
134
+ - **Phone numbers** (contact information) — weight: 2.0
135
+ - **Ad cue words** (à vendre, zu verkaufen, prix, etc.) — weight: 1.5
136
+ - **Area measurements** (m², square meters) — weight: 1.0
137
+ - **Room counts** (pieces, Zimmer) — weight: 1.0
138
+ - **Address indicators** (Rue, Avenue, Strasse, Platz) — weight: 0.8
139
+ - **Swiss postal codes** (4-digit codes) — weight: 0.5
140
+
141
+ **Rule Confidence:** `{main_result.get('rule_confidence', 0):.2f}`
142
+ How strongly the detected patterns suggest this is an ad.
143
+ - Strong indicators (price, phone): 40% weight each
144
+ - Medium indicators (cue words, area, rooms): 20% weight each
145
+ - Weak indicators (address, zip): 10% weight each
146
+
147
+ **Rule Influence:**
148
+ When model confidence < 0.75, rule-based signals help adjust the final probability.
149
+ Strong rule signals (score ≥ 4.0, confidence > 0.7) can boost the probability by up to 15%.
150
+ Special combinations like price + phone number receive additional boosts.
151
+ """)
152
+
153
+ # 6. Adaptive Thresholding
154
+ st.subheader("⚙️ Adaptive Threshold")
155
+
156
+ with st.expander("**Why this threshold was chosen**"):
157
+ st.markdown(f"""
158
+ The threshold of `{main_result.get('threshold_used', 0):.3f}` was determined by:
159
+
160
+ 1. **Language-specific baseline:** Different languages have different base thresholds (e.g., French: 0.0755, other: 0.9991)
161
+ 2. **Text length adjustment:** Shorter texts (< 30 words) get a reduced threshold (bonus: 0.2) to account for brevity
162
+ 3. **Historical accuracy tuning:** Thresholds are calibrated on historical newspaper data to balance precision and recall
163
+
164
+ This adaptive approach ensures the classifier works effectively across different languages and text lengths.
165
+ """)
166
 
167
+ # 7. Raw Diagnostics
168
+ with st.expander("🔧 **Raw diagnostic data (JSON)**"):
169
+ st.json(main_result)
170
 
171
  except Exception as e:
172
  st.error(f"Error during processing: {e}")