|
|
import html
from collections import Counter, defaultdict

import gradio as gr
import pandas as pd
from tabulate import tabulate
from textblob import TextBlob
from transformers import pipeline
|
|
|
|
|
|
|
|
# Summarization pipeline, constructed once when the module is imported.
# Nothing in the visible part of this file references it.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
|
|
|
|
|
def generate_category_summaries(df):
    """Build the summary tables for every category that has enough data.

    Args:
        df: Review DataFrame. This function reads the ``cluster_name``
            column; the helpers it calls read further columns.

    Returns:
        dict mapping each qualifying category to the list of table dicts
        returned by ``format_tables``. A category is left out when it has
        fewer than 10 review rows, or fewer than 3 products remain after
        ``get_product_stats``.
    """
    min_reviews = 10
    min_products = 3
    result = {}

    for cluster in df['cluster_name'].unique():
        cluster_rows = df.loc[df['cluster_name'] == cluster]
        if len(cluster_rows) >= min_reviews:
            stats = get_product_stats(cluster_rows)
            if len(stats) >= min_products:
                best, worst = get_top_and_worst_products(stats)
                details = analyze_top_products(best)
                result[cluster] = format_tables(cluster, details, worst)

    return result
|
|
|
|
|
def format_tables(category, product_details, worst_product):
    """Assemble the display tables for one category.

    Args:
        category: Category label; upper-cased for the first section title.
        product_details: List of dicts with ``name``, ``rating``,
            ``review_count``, ``pros`` and ``cons`` keys. May be empty.
        worst_product: DataFrame indexed by product name with
            ``avg_rating`` and ``reviews`` columns; only its first row is
            used. May be empty.

    Returns:
        List of dicts, each with ``section``, ``headers`` and ``data`` keys:
        always the "TOP PRODUCTS" table, then "KEY DIFFERENCES" when any
        product has a pro the others lack, then "PRODUCT TO AVOID" when a
        worst product is given that is not already one of the top products.
    """
    tables = [{
        'section': f"TOP PRODUCTS IN {str(category).upper()}",
        'headers': ["Product", "Rating", "Reviews", "Pros", "Cons"],
        'data': [
            [
                product['name'],
                f"★{product['rating']:.1f}",
                product['review_count'],
                "\n".join(product['pros']),
                "\n".join(product['cons']),
            ]
            for product in product_details
        ],
    }]

    # Pros shared by every top product do not distinguish them, so only the
    # remainder is listed per product. The guard keeps an empty product list
    # from raising IndexError on product_details[0].
    if product_details:
        common_pros = set(product_details[0]['pros']).intersection(
            *(product['pros'] for product in product_details[1:])
        )
    else:
        common_pros = set()

    diff_table = []
    for product in product_details:
        unique_pros = [p for p in product['pros'] if p not in common_pros]
        if unique_pros:
            diff_table.append([product['name'], ", ".join(unique_pros)])

    if diff_table:
        tables.append({
            'section': "KEY DIFFERENCES",
            'headers': ["Product", "Unique Features"],
            'data': diff_table,
        })

    # With only a few products the lowest-rated one can also be among the
    # top products; recommending and warning against the same product would
    # be contradictory, so the warning is dropped in that case.
    top_names = {product['name'] for product in product_details}
    if not worst_product.empty and worst_product.index[0] not in top_names:
        worst = worst_product.iloc[0]
        _, cons = analyze_sentiment(worst['reviews'])
        tables.append({
            'section': "PRODUCT TO AVOID",
            'headers': ["Product", "Rating", "Reasons to Avoid"],
            'data': [[
                worst_product.index[0],
                f"★{worst['avg_rating']:.1f}",
                ", ".join(cons[:3]) if cons else "Consistently poor ratings",
            ]],
        })

    return tables
|
|
|
|
|
def get_product_stats(category_df):
    """Aggregate one category's reviews into per-product statistics.

    Args:
        category_df: Review rows with ``name``, ``rating`` and ``text``
            columns.

    Returns:
        DataFrame indexed by product ``name`` with ``avg_rating`` (mean
        rating), ``review_count`` (number of ratings) and ``reviews`` (list
        of review texts) columns, limited to products with at least 5
        ratings.
    """
    min_reviews = 5
    per_product = category_df.groupby('name').agg(
        avg_rating=('rating', 'mean'),
        review_count=('rating', 'count'),
        reviews=('text', list),
    )
    has_enough = per_product['review_count'] >= min_reviews
    return per_product[has_enough]
|
|
|
|
|
def get_top_and_worst_products(product_stats):
    """Pick the best-rated and the worst-rated products.

    Args:
        product_stats: DataFrame with an ``avg_rating`` column.

    Returns:
        Tuple ``(top, worst)``: the 3 rows with the highest ``avg_rating``
        and the single row with the lowest ``avg_rating``.
    """
    best_three = product_stats.nlargest(n=3, columns='avg_rating')
    lowest_one = product_stats.nsmallest(n=1, columns='avg_rating')
    return best_three, lowest_one
|
|
|
|
|
def analyze_top_products(top_products):
    """Build a detail record for each top product from its reviews.

    Args:
        top_products: DataFrame indexed by product name with
            ``avg_rating``, ``review_count`` and ``reviews`` columns.

    Returns:
        List of dicts with ``name``, ``rating``, ``review_count``, ``pros``
        and ``cons`` keys. ``pros`` and ``cons`` hold at most three entries
        from ``analyze_sentiment``, or a single placeholder string when
        that list is empty.
    """
    def describe(name, stats_row):
        positives, negatives = analyze_sentiment(stats_row['reviews'])
        return {
            'name': name,
            'rating': stats_row['avg_rating'],
            'review_count': stats_row['review_count'],
            'pros': positives[:3] or ["no significant positive feedback"],
            'cons': negatives[:3] or ["no major complaints"],
        }

    return [describe(name, stats_row) for name, stats_row in top_products.iterrows()]
|
|
|
|
|
def analyze_sentiment(reviews):
    """Tally the nouns and adjectives used in clearly polar sentences.

    Each review is split into sentences. A sentence with polarity above 0.3
    adds its nouns and adjectives to the "pros" tally, one with polarity
    below -0.3 adds them to the "cons" tally; other sentences are ignored.

    Args:
        reviews: Iterable of review texts. Entries that are not strings
            (for example NaN or None for a missing review) are skipped.

    Returns:
        Tuple ``(pros, cons)`` of word lists, each ordered from the most to
        the least frequently counted word.
    """
    keyword_tags = {'NN', 'NNS', 'JJ', 'JJR', 'JJS'}
    pros = Counter()
    cons = Counter()

    for review in reviews:
        if not isinstance(review, str):
            # TextBlob only accepts text; a missing review has nothing to add.
            continue

        for sentence in TextBlob(review).sentences:
            polarity = sentence.sentiment.polarity
            if polarity > 0.3:
                tally = pros
            elif polarity < -0.3:
                tally = cons
            else:
                continue

            # Tag the sentence itself rather than the whole review, so a
            # word is only credited to the sentiment of the sentence it
            # actually appears in.
            tally.update(
                word for word, tag in sentence.tags if tag in keyword_tags
            )

    pros_sorted = [word for word, _ in pros.most_common()]
    cons_sorted = [word for word, _ in cons.most_common()]
    return pros_sorted, cons_sorted
|
|
|
|
|
def format_for_gradio(summaries):
    """Render the per-category summary tables as one HTML string.

    Args:
        summaries: Mapping of category -> list of table dicts, each with
            ``section``, ``headers`` and ``data`` keys.

    Returns:
        HTML with one ``<h2>`` per category followed by an ``<h3>`` and a
        styled table per section; categories are separated by ``<hr>``.
        An empty string when ``summaries`` is empty.
    """
    element_styles = (
        ('table', "width:100%; border-collapse: collapse; margin-bottom: 20px;"),
        ('th', "background-color: #f2f2f2; padding: 8px; text-align: left; border: 1px solid #ddd;"),
        # pre-line keeps newline-separated cell text on separate lines.
        ('td', "padding: 8px; border: 1px solid #ddd; white-space: pre-line;"),
    )

    outputs = []
    for category, tables in summaries.items():
        # Headings come from the data, so escape them before embedding.
        heading = html.escape(str(category).upper())
        parts = [f"<h2 style='color: #4a6baf;'>{heading}</h2>"]

        for table in tables:
            rendered = tabulate(
                table['data'],
                headers=table['headers'],
                tablefmt="html",
                stralign="left",
                numalign="center",
            )
            for tag, style in element_styles:
                # tabulate emits a style attribute on cells that are not
                # left-aligned, so a bare-tag match alone would leave those
                # cells unstyled. Extend existing attributes first, then
                # style the bare tags, so nothing is styled twice.
                rendered = rendered.replace(
                    f'<{tag} style="', f'<{tag} style="{style} '
                )
                rendered = rendered.replace(f'<{tag}>', f'<{tag} style="{style}">')

            section = html.escape(str(table['section']))
            parts.append(f"<h3 style='color: #3a5a8a;'>{section}</h3>")
            parts.append(rendered)

        outputs.append("".join(parts))

    return "<hr>".join(outputs)
|
|
|
|
|
def analyze_reviews(df):
    """Summarize every category in ``df`` and return the result as HTML.

    Chains ``generate_category_summaries`` and ``format_for_gradio``.
    """
    return format_for_gradio(generate_category_summaries(df))
|
|
|
|
|
|
|
|
# NOTE(review): `df` is not defined anywhere in the visible part of this file.
# It must be loaded, with a 'cluster_name' column, before this block runs.

_NO_SELECTION_HTML = "<p>Please select a category first.</p>"
_NO_RESULTS_HTML = "<p>Not enough review data to build a summary for this selection.</p>"


def _analyze_selected_category(category):
    """Return the HTML report for one category, or a hint when none is chosen."""
    if category is None or category == "":
        # Without this guard the filter compares against None, matches no
        # rows and the result panel stays blank.
        return _NO_SELECTION_HTML
    report = analyze_reviews(df[df['cluster_name'] == category])
    return report or _NO_RESULTS_HTML


def _analyze_all_categories():
    """Return the HTML report for every category, or a hint when there is none."""
    return analyze_reviews(df) or _NO_RESULTS_HTML


with gr.Blocks(title="Amazon Product Review Analyzer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Amazon Product Review Analyzer")
    gr.Markdown("Analyzing top products and reviews across categories")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Product Categories Found")
            category_dropdown = gr.Dropdown(
                choices=df['cluster_name'].unique().tolist(),
                label="Select a Category",
                interactive=True,
            )
            analyze_btn = gr.Button("Analyze Selected Category", variant="primary")

        with gr.Column():
            gr.Markdown("### All Categories Summary")
            all_categories_btn = gr.Button("Analyze All Categories", variant="secondary")

    output_html = gr.HTML(label="Analysis Results")

    # Enable the per-category button only while a category is selected.
    category_dropdown.change(
        fn=lambda x: gr.update(interactive=bool(x)),
        inputs=category_dropdown,
        outputs=analyze_btn,
    )

    analyze_btn.click(
        fn=_analyze_selected_category,
        inputs=category_dropdown,
        outputs=output_html,
    )

    all_categories_btn.click(
        fn=_analyze_all_categories,
        outputs=output_html,
    )

demo.launch()