Spaces:
Sleeping
Sleeping
maslionok
committed on
Commit
·
eff2c39
1
Parent(s):
7c3940e
added some explanations to the output
Browse files
app.py
CHANGED
|
@@ -8,6 +8,11 @@ st.set_page_config(page_title="Impresso Ad Classifier", layout="centered")
|
|
| 8 |
|
| 9 |
st.title("📰 Impresso Ad Classifier")
|
| 10 |
st.markdown("Enter text below to classify it as an Advertisement or Non-Advertisement using the `ad_model_pipeline`.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# --- LOAD PIPELINE ---
|
| 13 |
@st.cache_resource
|
|
@@ -23,7 +28,15 @@ except Exception as e:
|
|
| 23 |
st.stop()
|
| 24 |
|
| 25 |
# --- USER INTERFACE ---
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
if st.button("Classify", type="primary"):
|
| 29 |
if text_input.strip():
|
|
@@ -45,16 +58,115 @@ if st.button("Classify", type="primary"):
|
|
| 45 |
# 1. Visual Header
|
| 46 |
result_type = main_result.get('type', 'unknown')
|
| 47 |
if result_type == 'ad':
|
| 48 |
-
st.success(f"### Result: ADVERTISEMENT")
|
| 49 |
else:
|
| 50 |
-
st.info(f"### Result: NON-ADVERTISEMENT")
|
| 51 |
|
| 52 |
-
# 2.
|
| 53 |
-
st.subheader("
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
-
#
|
| 57 |
-
st.
|
|
|
|
| 58 |
|
| 59 |
except Exception as e:
|
| 60 |
st.error(f"Error during processing: {e}")
|
|
|
|
| 8 |
|
| 9 |
st.title("📰 Impresso Ad Classifier")
|
| 10 |
st.markdown("Enter text below to classify it as an Advertisement or Non-Advertisement using the `ad_model_pipeline`.")
|
| 11 |
+
st.info("""
|
| 12 |
+
**Now supports German and French!**
|
| 13 |
+
|
| 14 |
+
You can classify texts in English, German, or French. The classifier automatically adapts its thresholds and rules for these languages.
|
| 15 |
+
""")
|
| 16 |
|
| 17 |
# --- LOAD PIPELINE ---
|
| 18 |
@st.cache_resource
|
|
|
|
| 28 |
st.stop()
|
| 29 |
|
| 30 |
# --- USER INTERFACE ---
|
| 31 |
+
EXAMPLE_TEXT = (
|
| 32 |
+
"Nouveaux exploits des pilotes suisses Le record suisse de vol avee but fixé et retour au point de départ pour planeurs biplaces, détenu par Walter Meierhofer et Rosemarie Meierhofer avec 220 kilomètres, a été battu deux fois lundi. Partant de Diillikon, l'équipe HuberLiischer a en effet réussi un vol jusqn'aux Ponts-de-Martel et retour, soit nne distance de 278 km., tandis que Schàrli-Hodel atteignaient 261 km. qu'à La Chaux-de-Fonds et retour. D'autre part, à Birrfeld, Fritz _Dubg s. obtenu une distinction internationale ( Insigne or avec diamant)'pour avoir réalisé un gain d'altitude de 4000 mètres."
|
| 33 |
+
)
|
| 34 |
+
text_input = st.text_area(
|
| 35 |
+
"Input Text",
|
| 36 |
+
value=EXAMPLE_TEXT,
|
| 37 |
+
height=200,
|
| 38 |
+
placeholder="Paste historical text here..."
|
| 39 |
+
)
|
| 40 |
|
| 41 |
if st.button("Classify", type="primary"):
|
| 42 |
if text_input.strip():
|
|
|
|
| 58 |
# 1. Visual Header
|
| 59 |
result_type = main_result.get('type', 'unknown')
|
| 60 |
if result_type == 'ad':
|
| 61 |
+
st.success(f"### ✅ Result: ADVERTISEMENT")
|
| 62 |
else:
|
| 63 |
+
st.info(f"### ℹ️ Result: NON-ADVERTISEMENT")
|
| 64 |
|
| 65 |
+
# 2. Key Metrics
|
| 66 |
+
st.subheader("📊 Classification Metrics")
|
| 67 |
+
|
| 68 |
+
col1, col2 = st.columns(2)
|
| 69 |
+
with col1:
|
| 70 |
+
st.metric(
|
| 71 |
+
"Final Probability",
|
| 72 |
+
f"{main_result.get('promotion_prob_final', 0):.2f}",
|
| 73 |
+
help="The final probability that this text is an advertisement (after all adjustments)"
|
| 74 |
+
)
|
| 75 |
+
st.metric(
|
| 76 |
+
"Model Confidence",
|
| 77 |
+
f"{main_result.get('model_confidence', 0):.2f}",
|
| 78 |
+
help="How confident the AI model is in its prediction (0=uncertain, 1=very confident)"
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
with col2:
|
| 82 |
+
st.metric(
|
| 83 |
+
"Decision Threshold",
|
| 84 |
+
f"{main_result.get('threshold_used', 0):.2f}",
|
| 85 |
+
help="The threshold used for this text. Probability above this = ad classification"
|
| 86 |
+
)
|
| 87 |
+
st.metric(
|
| 88 |
+
"Rule Confidence",
|
| 89 |
+
f"{main_result.get('rule_confidence', 0):.2f}",
|
| 90 |
+
help="Confidence based on detected ad indicators (phone, price, etc.)"
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
# 3. Probability Breakdown
|
| 94 |
+
st.subheader("🔍 Probability Breakdown")
|
| 95 |
+
|
| 96 |
+
with st.expander("**How the final decision was made**", expanded=True):
|
| 97 |
+
st.markdown(f"""
|
| 98 |
+
**Initial Model Prediction:** `{main_result.get('promotion_prob', 0):.3f}`
|
| 99 |
+
The raw probability from the XLM-RoBERTa model that this text belongs to the 'Promotion' category.
|
| 100 |
+
|
| 101 |
+
**Ensemble Ad Signal:** `{main_result.get('ensemble_ad_signal', 0):.3f}`
|
| 102 |
+
Combined probability from ad-like categories (Promotion, Obituary, Call for participation) weighted at 70%,
|
| 103 |
+
plus inverse of non-ad categories (News, Opinion, Article, Report) weighted at 30%.
|
| 104 |
+
|
| 105 |
+
**Final Probability:** `{main_result.get('promotion_prob_final', 0):.3f}`
|
| 106 |
+
Blended score: 85% initial model prediction + 15% ensemble signal, with rule-based adjustments applied.
|
| 107 |
+
|
| 108 |
+
**Decision:** {'✅ **AD**' if result_type == 'ad' else '❌ **NON-AD**'} (final probability {'≥' if result_type == 'ad' else '<'} threshold of {main_result.get('threshold_used', 0):.3f})
|
| 109 |
+
""")
|
| 110 |
+
|
| 111 |
+
# 4. Model Details
|
| 112 |
+
st.subheader("🤖 Model Analysis")
|
| 113 |
+
|
| 114 |
+
with st.expander("**Cross-genre classification details**"):
|
| 115 |
+
st.markdown(f"""
|
| 116 |
+
The model classifies text across multiple newspaper genres. Here's what it detected:
|
| 117 |
+
|
| 118 |
+
- **Top Predicted Genre:** `{main_result.get('xgenre_top_label', 'unknown')}`
|
| 119 |
+
- **Confidence in Top Genre:** `{main_result.get('xgenre_top_prob', 0):.3f}`
|
| 120 |
+
|
| 121 |
+
The model uses multiple genre signals to determine if content is promotional in nature.
|
| 122 |
+
Ad-like genres include: Promotion, Obituary, and Call for participation.
|
| 123 |
+
""")
|
| 124 |
+
|
| 125 |
+
# 5. Rule-based Features
|
| 126 |
+
st.subheader("📋 Rule-Based Indicators")
|
| 127 |
+
|
| 128 |
+
with st.expander("**Detected advertisement patterns**"):
|
| 129 |
+
st.markdown(f"""
|
| 130 |
+
**Rule Score:** `{main_result.get('rule_score', 0):.2f}` / 10.0
|
| 131 |
+
This score is calculated from detected patterns common in advertisements:
|
| 132 |
+
|
| 133 |
+
- **Price mentions** (CHF, Fr., €, $) — weight: 2.0
|
| 134 |
+
- **Phone numbers** (contact information) — weight: 2.0
|
| 135 |
+
- **Ad cue words** (à vendre, zu verkaufen, prix, etc.) — weight: 1.5
|
| 136 |
+
- **Area measurements** (m², square meters) — weight: 1.0
|
| 137 |
+
- **Room counts** (pieces, Zimmer) — weight: 1.0
|
| 138 |
+
- **Address indicators** (Rue, Avenue, Strasse, Platz) — weight: 0.8
|
| 139 |
+
- **Swiss postal codes** (4-digit codes) — weight: 0.5
|
| 140 |
+
|
| 141 |
+
**Rule Confidence:** `{main_result.get('rule_confidence', 0):.2f}`
|
| 142 |
+
How strongly the detected patterns suggest this is an ad.
|
| 143 |
+
- Strong indicators (price, phone): 40% weight each
|
| 144 |
+
- Medium indicators (cue words, area, rooms): 20% weight each
|
| 145 |
+
- Weak indicators (address, zip): 10% weight each
|
| 146 |
+
|
| 147 |
+
**Rule Influence:**
|
| 148 |
+
When model confidence < 0.75, rule-based signals help adjust the final probability.
|
| 149 |
+
Strong rule signals (score ≥ 4.0, confidence > 0.7) can boost the probability by up to 15%.
|
| 150 |
+
Special combinations like price + phone number receive additional boosts.
|
| 151 |
+
""")
|
| 152 |
+
|
| 153 |
+
# 6. Adaptive Thresholding
|
| 154 |
+
st.subheader("⚙️ Adaptive Threshold")
|
| 155 |
+
|
| 156 |
+
with st.expander("**Why this threshold was chosen**"):
|
| 157 |
+
st.markdown(f"""
|
| 158 |
+
The threshold of `{main_result.get('threshold_used', 0):.3f}` was determined by:
|
| 159 |
+
|
| 160 |
+
1. **Language-specific baseline:** Different languages have different base thresholds (e.g., French: 0.0755, other: 0.9991)
|
| 161 |
+
2. **Text length adjustment:** Shorter texts (< 30 words) get a reduced threshold (bonus: 0.2) to account for brevity
|
| 162 |
+
3. **Historical accuracy tuning:** Thresholds are calibrated on historical newspaper data to balance precision and recall
|
| 163 |
+
|
| 164 |
+
This adaptive approach ensures the classifier works effectively across different languages and text lengths.
|
| 165 |
+
""")
|
| 166 |
|
| 167 |
+
# 7. Raw Diagnostics
|
| 168 |
+
with st.expander("🔧 **Raw diagnostic data (JSON)**"):
|
| 169 |
+
st.json(main_result)
|
| 170 |
|
| 171 |
except Exception as e:
|
| 172 |
st.error(f"Error during processing: {e}")
|