"""Gradio app that summarises product reviews per category.

For every category (``cluster_name``) the app ranks products by mean rating,
extracts frequently mentioned nouns/adjectives from strongly positive and
strongly negative sentences, and renders the result as HTML tables.

Expected DataFrame columns (as read by the code below): ``cluster_name``,
``name``, ``rating`` and ``text``.
"""
import html
from collections import defaultdict

import gradio as gr
import pandas as pd
from tabulate import tabulate
from textblob import TextBlob
from transformers import pipeline

# Initialize summarization pipeline.
# NOTE(review): `summarizer` is not referenced anywhere else in this file;
# it is kept because other code may import it, but loading it is expensive.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Placeholders shown when no pros/cons could be extracted for a product.
NO_PROS_PLACEHOLDER = "no significant positive feedback"
NO_CONS_PLACEHOLDER = "no major complaints"

# Part-of-speech tags treated as candidate "feature" words (nouns/adjectives).
FEATURE_TAGS = ('NN', 'NNS', 'JJ', 'JJR', 'JJS')

# NOTE(review): the original inline cell/header styles were lost when this
# file was extracted; these values are a reconstruction -- confirm or adjust.
# `white-space: pre-line` lets the newline-joined pros/cons show one per line.
_CELL_STYLE = (
    "padding: 8px; border: 1px solid #ddd; "
    "vertical-align: top; white-space: pre-line;"
)
_HEADER_STYLE = (
    "padding: 8px; border: 1px solid #ddd; "
    "background-color: #f2f2f2; text-align: left;"
)


def generate_category_summaries(df):
    """Generate product summaries in table format.

    Categories with fewer than 10 rows, or fewer than 3 products that pass
    the filter in ``get_product_stats``, are skipped.

    Returns:
        dict mapping category name -> list of table dicts (see
        ``format_tables``).
    """
    summaries = {}

    for category in df['cluster_name'].unique():
        category_df = df[df['cluster_name'] == category]
        if len(category_df) < 10:
            continue

        product_stats = get_product_stats(category_df)
        if len(product_stats) < 3:
            continue

        top_products, worst_product = get_top_and_worst_products(product_stats)
        product_details = analyze_top_products(top_products)

        # Format as tables
        summaries[category] = format_tables(
            category, product_details, worst_product
        )

    return summaries


def format_tables(category, product_details, worst_product):
    """Format all sections as tables.

    Args:
        category: Category name (string); upper-cased for the section title.
        product_details: List of dicts as produced by ``analyze_top_products``.
        worst_product: DataFrame with at most one row (may be empty or None).

    Returns:
        List of dicts with keys ``section``, ``headers`` and ``data``.
    """
    tables = []

    if product_details:
        # Top Products Table
        top_table = [
            [
                product['name'],
                f"★{product['rating']:.1f}",
                product['review_count'],
                "\n".join(product['pros']),
                "\n".join(product['cons']),
            ]
            for product in product_details
        ]
        tables.append({
            'section': f"TOP PRODUCTS IN {category.upper()}",
            'headers': ["Product", "Rating", "Reviews", "Pros", "Cons"],
            'data': top_table,
        })

        # Key Differences Table: pros that are not shared by every product.
        common_pros = set(product_details[0]['pros'])
        for product in product_details[1:]:
            common_pros.intersection_update(product['pros'])

        diff_table = []
        for product in product_details:
            unique_pros = [
                p for p in product['pros']
                # The placeholder is not a real feature, so never list it.
                if p not in common_pros and p != NO_PROS_PLACEHOLDER
            ]
            if unique_pros:
                diff_table.append([product['name'], ", ".join(unique_pros)])

        if diff_table:
            tables.append({
                'section': "KEY DIFFERENCES",
                'headers': ["Product", "Unique Features"],
                'data': diff_table,
            })

    # Worst Product Table
    if worst_product is not None and not worst_product.empty:
        worst = worst_product.iloc[0]
        _, cons = analyze_sentiment(worst['reviews'])
        tables.append({
            'section': "PRODUCT TO AVOID",
            'headers': ["Product", "Rating", "Reasons to Avoid"],
            'data': [[
                worst_product.index[0],
                f"★{worst['avg_rating']:.1f}",
                ", ".join(cons[:3]) if cons else "Consistently poor ratings",
            ]],
        })

    return tables


def get_product_stats(category_df):
    """Calculate product statistics from dataframe.

    Groups by ``name`` and returns a DataFrame indexed by product name with
    columns ``avg_rating``, ``review_count`` and ``reviews`` (list of review
    texts), keeping only products with at least 5 ratings.
    """
    stats = category_df.groupby('name').agg(
        avg_rating=('rating', 'mean'),
        review_count=('rating', 'count'),
        reviews=('text', list),
    )
    return stats[stats['review_count'] >= 5]


def get_top_and_worst_products(product_stats):
    """Identify best and worst performing products.

    Returns:
        Tuple ``(top, worst)``: the up-to-3 highest rated products and the
        single lowest rated product among the remaining ones. ``worst`` is
        empty when every product is already in ``top``, so a product is never
        both recommended and listed as one to avoid.
    """
    top = product_stats.nlargest(3, 'avg_rating')
    remaining = product_stats.drop(index=top.index)
    worst = remaining.nsmallest(1, 'avg_rating')
    return top, worst


def analyze_top_products(top_products):
    """Extract pros/cons from top products' reviews.

    Returns a list of dicts with keys ``name``, ``rating``, ``review_count``,
    ``pros`` and ``cons`` (at most 3 entries each; a placeholder entry is used
    when nothing was extracted).
    """
    product_details = []
    for product, row in top_products.iterrows():
        pros, cons = analyze_sentiment(row['reviews'])
        product_details.append({
            'name': product,
            'rating': row['avg_rating'],
            'review_count': row['review_count'],
            'pros': pros[:3] or [NO_PROS_PLACEHOLDER],
            'cons': cons[:3] or [NO_CONS_PLACEHOLDER],
        })
    return product_details


def analyze_sentiment(reviews):
    """Perform sentiment analysis on reviews.

    Each sentence with polarity above 0.3 (below -0.3) contributes its own
    nouns and adjectives to the pros (cons) counts.

    Args:
        reviews: Iterable of review texts; non-string entries (e.g. NaN) are
            skipped.

    Returns:
        Tuple ``(pros, cons)`` of word lists sorted by descending frequency.
    """
    pros = defaultdict(int)
    cons = defaultdict(int)

    for review in reviews:
        if not isinstance(review, str):
            # Missing values from pandas arrive as float NaN; TextBlob
            # only accepts strings.
            continue

        for sentence in TextBlob(review).sentences:
            polarity = sentence.sentiment.polarity
            if polarity > 0.3:  # Positive
                target = pros
            elif polarity < -0.3:  # Negative
                target = cons
            else:
                continue

            # Tag the sentence itself, not the whole review, so words are
            # only credited to the sentence they appear in.
            for word, tag in sentence.tags:
                if tag in FEATURE_TAGS:
                    target[word] += 1

    # Sort by descending count; ties keep first-seen order (stable sort).
    pros_sorted = [k for k, _ in sorted(pros.items(), key=lambda x: -x[1])]
    cons_sorted = [k for k, _ in sorted(cons.items(), key=lambda x: -x[1])]
    return pros_sorted, cons_sorted


def format_for_gradio(summaries):
    """Convert summary tables to HTML for Gradio display.

    NOTE(review): the HTML literals of this function were stripped from the
    source this file was recovered from. The heading tags, cell styles and the
    separator between categories are a reconstruction -- confirm against the
    intended layout.
    """
    outputs = []
    for category, tables in summaries.items():
        # Category and section titles come from the data, so escape them.
        parts = [f"<h2>{html.escape(str(category))}</h2>"]
        for table in tables:
            parts.append(f"<h3>{html.escape(str(table['section']))}</h3>")
            # tablefmt="html" escapes cell contents itself.
            table_html = tabulate(
                table['data'], headers=table['headers'], tablefmt="html"
            )
            table_html = table_html.replace(
                '<td>', f'<td style="{_CELL_STYLE}">'
            )
            table_html = table_html.replace(
                '<th>', f'<th style="{_HEADER_STYLE}">'
            )
            parts.append(table_html)
        outputs.append("".join(parts))

    return "<hr>".join(outputs)


def analyze_reviews(df):
    """Main function to process data and generate summaries."""
    summaries = generate_category_summaries(df)
    return format_for_gradio(summaries)


# Create Gradio interface
# NOTE(review): `df` is not defined anywhere in this file. It must already
# exist in the executing namespace (e.g. an earlier notebook cell) with the
# columns listed in the module docstring, otherwise this raises NameError.
with gr.Blocks(title="Amazon Product Review Analyzer",
               theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Amazon Product Review Analyzer")
    gr.Markdown("Analyzing top products and reviews across categories")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Product Categories Found")
            category_dropdown = gr.Dropdown(
                choices=df['cluster_name'].unique().tolist(),
                label="Select a Category",
                interactive=True
            )
            analyze_btn = gr.Button("Analyze Selected Category",
                                    variant="primary")

        with gr.Column():
            gr.Markdown("### All Categories Summary")
            all_categories_btn = gr.Button("Analyze All Categories",
                                           variant="secondary")

    output_html = gr.HTML(label="Analysis Results")

    # Button actions
    category_dropdown.change(
        fn=lambda x: gr.update(interactive=bool(x)),
        inputs=category_dropdown,
        outputs=analyze_btn
    )

    analyze_btn.click(
        fn=lambda cat: analyze_reviews(df[df['cluster_name'] == cat]),
        inputs=category_dropdown,
        outputs=output_html
    )

    all_categories_btn.click(
        fn=lambda: analyze_reviews(df),
        outputs=output_html
    )

# Launch the interface
demo.launch()