Spaces:

Suzana
/

labelit-mini-ner

Sleeping

App Files Files Community

labelit-mini-ner / app.py

Suzana

Create app.py

f3b49b2 verified 7 months ago

raw

history blame

3.64 kB

	import gradio as gr
	import pandas as pd
	from pathlib import Path

	# Global token storage: one row per token, shared by all the callbacks below.
	# It is created with its final columns so the table has a stable schema even
	# before any data has been loaded.
	token_df = pd.DataFrame(columns=["sentence_id", "token", "label"])

	# Generate generic sample sentences
	def make_sample_data(n=100):
	    """Build a DataFrame of ``n`` generic sample sentences.

	    Each sentence has the form ``"<person> <verb> <org> in <place>."``. The
	    parts are picked by cycling through small fixed word lists, so the output
	    is deterministic.

	    Args:
	        n: Number of sentences to generate. Zero or a negative value yields
	            an empty frame.

	    Returns:
	        A ``pandas.DataFrame`` with a single ``text`` column.
	    """
	    people = ["Alice", "Bob", "Charlie", "Diane", "Eve"]
	    orgs = ["Acme Corp", "Globex", "Initech", "Umbrella", "Stark Inc"]
	    locs = ["Paris", "New York", "London", "Tokyo", "Sydney"]
	    verbs = ["visited", "joined", "founded", "traveled to", "met with"]

	    rows = []
	    for i in range(n):
	        person = people[i % len(people)]
	        verb = verbs[i % len(verbs)]
	        org = orgs[i % len(orgs)]
	        place = locs[i % len(locs)]
	        rows.append({"text": f"{person} {verb} {org} in {place}."})

	    # Name the column explicitly so an empty result still exposes ``text``.
	    return pd.DataFrame(rows, columns=["text"])

	def load_data(file):
	    """Load sentences, tokenize them and fill the global token table.

	    Args:
	        file: Value of the upload component: a path string, an object that
	            exposes the path as ``.name``, or a falsy value to fall back to
	            the built-in sample sentences.

	    Returns:
	        A 3-tuple of updates for (token table, status message, actions row).
	        On failure the table and the actions row are hidden and the status
	        carries the error message; ``token_df`` is left untouched.
	    """
	    global token_df

	    # Load user CSV or fall back to sample sentences.
	    if file:
	        # Accept both a plain path and a wrapper exposing the path as ``.name``.
	        path = getattr(file, "name", file)
	        try:
	            df = pd.read_csv(path)
	        except (OSError, ValueError) as exc:
	            # pandas' parser, empty-data and decoding errors are ValueError
	            # subclasses; report them the same way as a missing column.
	            return (
	                gr.update(visible=False),
	                f"❌ Could not read CSV: {exc}",
	                gr.update(visible=False),
	            )
	    else:
	        df = make_sample_data(100)

	    if "text" not in df.columns:
	        return (
	            gr.update(visible=False),
	            "❌ CSV must contain a `text` column.",
	            gr.update(visible=False),
	        )

	    # Missing cells become empty sentences and non-string cells are
	    # stringified, so ``split`` never sees a float/NaN.
	    texts = df["text"].fillna("").astype(str)

	    # Tokenize on whitespace into (sentence_id, token, label) records; every
	    # token starts with the "outside" label.
	    records = [
	        {"sentence_id": sid, "token": tok, "label": "O"}
	        for sid, txt in enumerate(texts)
	        for tok in txt.split()
	    ]
	    # Explicit columns keep the schema stable even when there are no tokens.
	    token_df = pd.DataFrame(records, columns=["sentence_id", "token", "label"])

	    return (
	        gr.update(value=token_df, visible=True),
	        f"✅ Loaded {len(df)} sentences → {len(token_df)} tokens.",
	        gr.update(visible=True),
	    )

	def save_edits(table):
	    """Store the edited grid as the new global token table.

	    Args:
	        table: Edited table contents; anything the ``pandas.DataFrame``
	            constructor accepts as data.

	    Returns:
	        The confirmation message for the status box.
	    """
	    global token_df
	    column_names = ["sentence_id", "token", "label"]
	    edited = pd.DataFrame(table, columns=column_names)
	    token_df = edited
	    return "💾 Edits saved."

	def download_tokens():
	    """Write the global token table to ``raw_tokens.csv`` and return its path.

	    The file is written relative to the current working directory, without
	    the DataFrame index.
	    """
	    target = Path("raw_tokens.csv")
	    token_df.to_csv(target, index=False)
	    return target

	def download_iob():
	    """Export the global token table with an added IOB tag column.

	    Within a sentence, the first token of a run of identical labels is tagged
	    ``B-<label>`` and the following tokens ``I-<label>``. Tokens labelled
	    ``O`` are tagged ``O``, as are tokens whose label is missing or blank
	    (for example a cell that was cleared in the editor). Surrounding
	    whitespace in a label is ignored.

	    Returns:
	        ``Path("ner_iob.csv")``, written relative to the current working
	        directory. The file holds the token table plus an ``iob`` column.
	    """
	    # An empty table may not have its columns yet; there is nothing to tag.
	    if token_df.empty:
	        pairs = []
	    else:
	        pairs = zip(token_df["sentence_id"], token_df["label"])

	    tags = []
	    # sentence_id -> label of the previous token, or None if it was outside.
	    open_label = {}
	    for sid, raw in pairs:
	        label = "" if pd.isna(raw) else str(raw).strip()
	        if label in ("", "O"):
	            tags.append("O")
	            open_label[sid] = None
	            continue
	        prefix = "I-" if open_label.get(sid) == label else "B-"
	        tags.append(prefix + label)
	        open_label[sid] = label

	    out = token_df.copy()
	    out["iob"] = tags
	    target = Path("ner_iob.csv")
	    out.to_csv(target, index=False)
	    return target

	# Build the UI: upload/load controls, an editable token grid, and a row of
	# save/download actions that stays hidden until data has been loaded.
	with gr.Blocks() as app:
	    gr.Markdown("# 🏷️ Label It! Mini-NER")
	    gr.Markdown("Step 1: Upload a CSV with a `text` column, or leave blank for sample sentences.")

	    with gr.Row():
	        file_in = gr.File(label="📁 Upload CSV", file_types=[".csv"])
	        load_btn = gr.Button("Load Data")

	    status = gr.Textbox(label="Status", interactive=False)

	    # ``interactive`` is the Dataframe keyword that enables editing; the
	    # component has no ``editable`` parameter.
	    table = gr.Dataframe(
	        headers=["sentence_id", "token", "label"],
	        interactive=True,
	        visible=False,
	        label="📝 Annotate Tokens",
	    )

	    with gr.Row(visible=False) as actions:
	        save_btn = gr.Button("💾 Save Edits")
	        # A DownloadButton serves the file its ``value`` points at and takes
	        # no ``fn``/``file_name`` arguments, so the buttons start empty and
	        # receive their files from the events bound below.
	        dl_tokens = gr.DownloadButton(label="⬇️ Download Tokens CSV")
	        dl_iob = gr.DownloadButton(label="⬇️ Download IOB CSV")

	    def _refresh_downloads():
	        """Regenerate both export files and return their paths for the buttons."""
	        return str(download_tokens()), str(download_iob())

	    # Bind events. After every load or save the export files are rebuilt so
	    # the download buttons always serve the current token table.
	    load_btn.click(
	        load_data,
	        inputs=file_in,
	        outputs=[table, status, actions],
	    ).then(
	        _refresh_downloads,
	        outputs=[dl_tokens, dl_iob],
	    )
	    save_btn.click(
	        save_edits,
	        inputs=table,
	        outputs=status,
	    ).then(
	        _refresh_downloads,
	        outputs=[dl_tokens, dl_iob],
	    )

	    gr.Markdown("""
	    Step 2:
	    - Click into the label column and type one of:
	    `PER`, `ORG`, `LOC`, or leave as `O`.
	    - Save Edits, then download your token CSV or IOB‐tagged CSV.
	    """)

	app.launch()