Spaces:

ZombitX64
/

AutoGDataset

Paused

App Files Files Community

Nattapong Tapachoom committed on Sep 2

Commit

084df26

1 Parent(s): b410a7f

Refactor app.py to improve model loading and PDF processing; update dataset generation logic and enhance UI components

Browse files

Files changed (1) hide show

app.py +110 -640

app.py CHANGED Viewed

@@ -6,28 +6,30 @@ from datetime import datetime
 from typing import List, Dict, Any, Tuple
 import gradio as gr
-# Detect if OAuth is available (enabled on Spaces when hf_oauth: true)
-OAUTH_AVAILABLE = bool(os.getenv("OAUTH_CLIENT_ID"))
-# Require login to use the app. Defaults to on only when OAuth is available.
-_default_require = "1" if OAUTH_AVAILABLE else "0"
-REQUIRE_LOGIN = os.getenv("REQUIRE_LOGIN", _default_require).strip().lower() in ("1", "true", "yes", "y")
-try:
-    from pypdf import PdfReader
-except Exception:  # pragma: no cover - lazy import warning only
-    PdfReader = None  # type: ignore
-# LangChain components
-try:
-    from langchain_core.prompts import PromptTemplate
-    from langchain_core.output_parsers import JsonOutputParser
-    from langchain_huggingface import HuggingFaceEndpoint
-except Exception:
-    PromptTemplate = None  # type: ignore
-    JsonOutputParser = None  # type: ignore
-    HuggingFaceEndpoint = None  # type: ignore
 def ensure_output_dir() -> str:
@@ -37,11 +39,6 @@ def ensure_output_dir() -> str:
 def read_pdfs(files: List[gr.File]) -> Tuple[str, List[Dict[str, Any]]]:
-    if not files:
-        return "", []
-    if PdfReader is None:
-        raise RuntimeError("pypdf is not installed. Please add it to requirements.txt or pip install pypdf.")
     docs = []
     combined_text_parts: List[str] = []
     for f in files:
@@ -49,11 +46,7 @@ def read_pdfs(files: List[gr.File]) -> Tuple[str, List[Dict[str, Any]]]:
         reader = PdfReader(path)
         pages_text = []
         for i, page in enumerate(reader.pages):
-            try:
-                text = page.extract_text() or ""
-            except Exception:
-                text = ""
-            # Normalize whitespace
             text = re.sub(r"\s+", " ", text).strip()
             if text:
                 pages_text.append({"page": i + 1, "text": text})
@@ -63,665 +56,142 @@ def read_pdfs(files: List[gr.File]) -> Tuple[str, List[Dict[str, Any]]]:
     return combined_text, docs
-def chunk_text(text: str, chunk_size: int = 1500, overlap: int = 200, max_chunks: int = 5) -> List[Dict[str, Any]]:
     text = text.strip()
     if not text:
         return []
-    chunks: List[Dict[str, Any]] = []
     start = 0
     n = len(text)
     while start < n and len(chunks) < max_chunks:
         end = min(start + chunk_size, n)
         chunk = text[start:end]
-        # try to end on a sentence boundary
-        if end < n:
-            m = re.search(r"[\.!?]\s", text[end - 200:end] if end - 200 > start else text[start:end])
-            if m:
-                end = start + (m.end())
-                chunk = text[start:end]
-        chunks.append({"index": len(chunks), "start": start, "end": end, "text": chunk})
         if end >= n:
             break
         start = max(end - overlap, 0)
-        if start == end:  # safety
-            start += 1
     return chunks
-DEFAULT_QA_PROMPT_TMPL = (
-    'คุณเป็นผู้สร้างชุดข้อมูลที่เป็นประโยชน์ อ่านเนื้อหาที่ให้มาและสร้างคู่คำถาม-คำตอบที่มีคุณภาพสูงและตรงตามข้อเท็จจริง จำนวน {min_pairs} ถึง {max_pairs} คู่ '
-    'ส่งคืนเฉพาะ JSON array ที่มี objects ในรูปแบบ {{"question": str, "answer": str}} เท่านั้น ไม่ต้องใส่ข้อความเพิ่มเติม คำอธิบาย หรือ code fences\n\n'
-    'เนื้อหา:\n{content}\n'
 )
-TASK_TEMPLATES: Dict[str, str] = {
-    "QA": DEFAULT_QA_PROMPT_TMPL,
-    "Summarization": (
-        'สรุปเนื้อหาต่อไปนี้เป็นบทสรุปที่กระชับ จำนวน {min_pairs} ถึง {max_pairs} บทสรุป โดยครอบคลุมข้อมูลสำคัญ '
-        'ส่งคืนเฉพาะ JSON array ที่มี objects ในรูปแบบ {{"summary": str}} เท่านั้น ไม่ต้องมีข้อความเพิ่มเติม\n\n'
-        'เนื้อหา:\n{content}\n'
-    ),
-    "Keywords": (
-        'แยกคำสำคัญหรือวลีสำคัญจากเนื้อหา จำนวน {min_pairs} ถึง {max_pairs} คำ '
-        'ส่งคืนเฉพาะ JSON array ของ objects ที่มี {{"keyword": str}} เท่านั้น ไม่ต้องมีข้อความเพิ่มเติม\n\n'
-        'เนื้อหา:\n{content}\n'
-    ),
-    "NER": (
-        'แยกเอนทิตีที่มีชื่อเฉพาะจากเนื้อหา ส่งคืนเฉพาะ JSON array ของ objects ที่มี {{"text": str, "label": str, "start": int, "end": int}} '
-        'ป้ายกำกับควรเป็นประเภทมาตรฐาน เช่น PER (บุคคล), ORG (องค์กร), LOC (สถานที่), MISC (อื่นๆ){ner_labels_clause}\n\n'
-        'เนื้อหา:\n{content}\n'
-    ),
-    "Classification": (
-        'จำแนกเนื้อหาตามป้ายกำกับต่อไปนี้: {labels} {multi_label_clause} '
-        'ส่งคืนเฉพาะ JSON array ที่มี objects ในรูปแบบ {{"labels": [str], "rationale": str}} เท่านั้น ไม่ต้องมีข้อความเพิ่มเติม\n\n'
-        'เนื้อหา:\n{content}\n'
-    ),
-    "MCQ": (
-        'สร้างคำถามแบบเลือกตอบจากเนื้อหา จำนวน {min_pairs} ถึง {max_pairs} ข้อ แต่ละข้อมี {num_options} ตัวเลือก '
-        'ส่งคืนเฉพาะ JSON array ของ objects ที่มี {{"question": str, "options": [str], "answer_index": int}} เท่านั้น ไม่ต้องมีข้อความเพิ่มเติม\n\n'
-        'เนื้อหา:\n{content}\n'
-    ),
-    "True/False": (
-        'สร้างข้อความจริง/เท็จที่อิงจากเนื้อหาเท่านั้น จำนวน {min_pairs} ถึง {max_pairs} ข้อความ '
-        'ส่งคืนเฉพาะ JSON array ของ objects ที่มี {{"statement": str, "answer": bool, "explanation": str}} เท่านั้น ไม่ต้องมีข้อความเพิ่มเติม\n\n'
-        'เนื้อหา:\n{content}\n'
-    ),
-    "Translation": (
-        'แปลเนื้อหาเป็น{target_language} สร้างคู่ประโยคแบบคู่ขนาน จำนวน {min_pairs} ถึง {max_pairs} คู่ '
-        'ส่งคืนเฉพาะ JSON array ของ objects ที่มี {{"source": str, "target": str}} เท่านั้น ไม่ต้องมีข้อความเพิ่มเติม\n\n'
-        'เนื้อหา:\n{content}\n'
-    ),
-    "RLHF": (
-        'สร้างข้อมูลสำหรับ Reinforcement Learning from Human Feedback (RLHF) จากเนื้อหานี้ '
-        'สร้างคำถามและการตอบสนองหลายแบบ พร้อมคะแนนความต้องการของมนุษย์ จำนวน {min_pairs} ถึง {max_pairs} ชุด '
-        'ส่งคืนเฉพาะ JSON array ของ objects ที่มี {{"prompt": str, "responses": [str], "scores": [float], "preferred_response": str}} เท่านั้น\n\n'
-        'เนื้อหา:\n{content}\n'
-    ),
-    "DPO": (
-        'สร้างข้อมูลสำหรับ Direct Preference Optimization (DPO) จากเนื้อหานี้ '
-        'สร้างคำถามพร้อมการตอบสนองที่ดีและไม่ดี จำนวน {min_pairs} ถึง {max_pairs} คู่ '
-        'ส่งคืนเฉพาะ JSON array ของ objects ที่มี {{"prompt": str, "chosen": str, "rejected": str, "reason": str}} เท่านั้น\n\n'
-        'เนื้อหา:\n{content}\n'
-    ),
-    "Instruction_Following": (
-        'สร้างคำสั่งและการตอบสนองสำหรับการฝึกการทำตามคำสั่ง จำนวน {min_pairs} ถึง {max_pairs} คู่ '
-        'ส่งคืนเฉพาะ JSON array ของ objects ที่มี {{"instruction": str, "input": str, "output": str, "difficulty": str}} เท่านั้น\n\n'
-        'เนื้อหา:\n{content}\n'
-    ),
-    "Constitutional_AI": (
-        'สร้างข้อมูลสำหรับ Constitutional AI โดยสร้างคำถามที่อาจมีปัญหาทางจริยธรรมและคำตอบที่เหมาะสม '
-        'จำนวน {min_pairs} ถึง {max_pairs} คู่ '
-        'ส่งคืนเฉพาะ JSON array ของ objects ที่มี {{"problematic_prompt": str, "constitutional_response": str, "principle": str}} เท่านั้น\n\n'
-        'เนื้อหา:\n{content}\n'
-    ),
-    "Chain_of_Thought": (
-        'สร้างตัวอย่างการคิดแบบขั้นตอน (Chain of Thought) จากเนื้อหา จำนวน {min_pairs} ถึง {max_pairs} ตัวอย่าง '
-        'ส่งคืนเฉพาะ JSON array ของ objects ที่มี {{"problem": str, "thinking_steps": [str], "final_answer": str}} เท่านั้น\n\n'
-        'เนื้อหา:\n{content}\n'
-    ),
-    "Dialogue": (
-        'สร้างบทสนทนาระหว่างผู้ใช้และผู้ช่วย AI จากเนื้อหา จำนวน {min_pairs} ถึง {max_pairs} บทสนทนา '
-        'ส่งคืนเฉพาะ JSON array ของ objects ที่มี {{"dialogue": [{{"role": str, "content": str}}], "context": str}} เท่านั้น\n\n'
-        'เนื้อหา:\n{content}\n'
-    ),
-    "Thai_Culture": (
-        'สร้างคำถาม-คำตอบเกี่ยวกับวัฒนธรรมไทยจากเนื้อหา เน้นความเข้าใจภาษาไทยและบริบททางวัฒนธรรม '
-        'จำนวน {min_pairs} ถึง {max_pairs} คู่ '
-        'ส่งคืนเฉพาะ JSON array ของ objects ที่มี {{"question_th": str, "answer_th": str, "cultural_context": str}} เท่านั้น\n\n'
-        'เนื้อหา:\n{content}\n'
-    ),
-}
-def extract_json_array(text: str) -> List[Dict[str, Any]]:
-    if not text:
-        return []
-    # Remove code fences
-    text = re.sub(r"```[a-zA-Z]*", "```", text)
-    text = text.replace("```", "")
-    # Find first [ ... ] block
-    start = text.find("[")
-    end = text.rfind("]")
-    if start != -1 and end != -1 and end > start:
-        candidate = text[start : end + 1]
-    else:
-        candidate = text
-    try:
-        data = json.loads(candidate)
-        if isinstance(data, list):
-            # normalize
-            norm = []
-            for item in data:
-                if not isinstance(item, dict):
-                    continue
-                q = str(item.get("question", "").strip())
-                a = str(item.get("answer", "").strip())
-                if q and a:
-                    norm.append({"question": q, "answer": a})
-            return norm
-    except Exception:
-        pass
-    return []
-def build_langchain(model_id: str, hf_token: str | None, max_new_tokens: int, temperature: float, template: str):
-    if any(x is None for x in [PromptTemplate, JsonOutputParser, HuggingFaceEndpoint]):
-        raise RuntimeError("langchain, langchain-community, and langchain-huggingface are required. Please add to requirements.txt.")
-    # Prompt
-    prompt = PromptTemplate.from_template(template)
-    # Model wrapper (Hugging Face Inference API)
-    llm = HuggingFaceEndpoint(
-        model=model_id,
-        token=hf_token,
-        task="text-generation",
-        max_new_tokens=max_new_tokens,
-        temperature=temperature,
-        do_sample=temperature > 0.0,
-    )
-    parser = JsonOutputParser()
-    chain = prompt | llm | parser
-    return chain
-def get_task_template(task: str, custom_instruction: str | None) -> str:
-    base = TASK_TEMPLATES.get(task, DEFAULT_QA_PROMPT_TMPL)
-    if custom_instruction and custom_instruction.strip():
-        # Allow user to override fully, but ensure {content} is present
-        if "{content}" not in custom_instruction:
-            custom_instruction = custom_instruction.strip() + "\n\nContent:\n{content}\n"
-        return custom_instruction
-    return base
-def normalize_items(task: str, data: Any) -> List[Dict[str, Any]]:
-    # Convert model output to list[dict] per task
-    items: List[Dict[str, Any]] = []
-    if data is None:
-        return items
-    if isinstance(data, str):
-        data = extract_json_array(data)
-    if isinstance(data, dict):
-        # handle wrappers like {"items": [...]}
-        if "items" in data and isinstance(data["items"], list):
-            data = data["items"]
-        else:
-            data = [data]
-    if isinstance(data, list):
-        # keywords may be list[str]
-        if task == "Keywords" and data and all(isinstance(x, str) for x in data):
-            return [{"keyword": x} for x in data if x]
-        for el in data:
-            if isinstance(el, dict):
-                items.append(el)
-    # Validate per-task required fields and normalize variants
-    norm: List[Dict[str, Any]] = []
-    for it in items:
-        if task == "QA":
-            q = str(it.get("question", "")).strip()
-            a = str(it.get("answer", "")).strip()
-            if q and a:
-                norm.append({"question": q, "answer": a})
-        elif task == "Summarization":
-            s = str(it.get("summary", "")).strip()
-            if s:
-                norm.append({"summary": s})
-        elif task == "Keywords":
-            k = it.get("keyword")
-            if isinstance(k, str) and k.strip():
-                norm.append({"keyword": k.strip()})
-            elif isinstance(it.get("keywords"), list):
-                for kw in it["keywords"]:
-                    if isinstance(kw, str) and kw.strip():
-                        norm.append({"keyword": kw.strip()})
-        elif task == "NER":
-            txt = it.get("text")
-            label = it.get("label")
-            start = it.get("start")
-            end = it.get("end")
-            if isinstance(txt, str) and isinstance(label, str) and isinstance(start, int) and isinstance(end, int):
-                norm.append({"text": txt, "label": label, "start": start, "end": end})
-            elif isinstance(it.get("entities"), list):
-                for ent in it["entities"]:
-                    if all(k in ent for k in ("text", "label", "start", "end")):
-                        norm.append({
-                            "text": str(ent.get("text", "")),
-                            "label": str(ent.get("label", "")),
-                            "start": int(ent.get("start", 0)),
-                            "end": int(ent.get("end", 0)),
-                        })
-        elif task == "Classification":
-            labels = it.get("labels")
-            if isinstance(labels, str):
-                labels = [labels]
-            if isinstance(labels, list):
-                labels = [str(x).strip() for x in labels if str(x).strip()]
-                rationale = str(it.get("rationale", "")).strip()
-                if labels:
-                    norm.append({"labels": labels, "rationale": rationale})
-        elif task == "MCQ":
-            q = it.get("question")
-            options = it.get("options")
-            answer_index = it.get("answer_index")
-            answer = it.get("answer")
-            if isinstance(options, list) and all(isinstance(o, str) for o in options) and isinstance(q, str):
-                if isinstance(answer_index, int):
-                    idx = answer_index
-                elif isinstance(answer, str) and answer in options:
-                    idx = options.index(answer)
-                else:
-                    continue
-                norm.append({"question": q, "options": options, "answer_index": idx})
-        elif task == "True/False":
-            st = it.get("statement")
-            ans = it.get("answer")
-            expl = it.get("explanation", "")
-            if isinstance(st, str):
-                if isinstance(ans, bool):
-                    val = ans
-                elif isinstance(ans, str):
-                    val = ans.strip().lower() in ("true", "t", "yes", "1")
-                else:
-                    continue
-                norm.append({"statement": st, "answer": val, "explanation": str(expl)})
-        elif task == "Translation":
-            src = it.get("source")
-            tgt = it.get("target")
-            if isinstance(src, str) and isinstance(tgt, str) and src.strip() and tgt.strip():
-                norm.append({"source": src, "target": tgt})
-        elif task == "RLHF":
-            prompt = it.get("prompt")
-            responses = it.get("responses")
-            scores = it.get("scores")
-            preferred = it.get("preferred_response")
-            if isinstance(prompt, str) and isinstance(responses, list) and isinstance(scores, list):
-                norm.append({
-                    "prompt": prompt,
-                    "responses": responses,
-                    "scores": scores,
-                    "preferred_response": str(preferred) if preferred else ""
-                })
-        elif task == "DPO":
-            prompt = it.get("prompt")
-            chosen = it.get("chosen")
-            rejected = it.get("rejected")
-            reason = it.get("reason", "")
-            if isinstance(prompt, str) and isinstance(chosen, str) and isinstance(rejected, str):
-                norm.append({
-                    "prompt": prompt,
-                    "chosen": chosen,
-                    "rejected": rejected,
-                    "reason": str(reason)
-                })
-        elif task == "Instruction_Following":
-            instruction = it.get("instruction")
-            input_text = it.get("input", "")
-            output = it.get("output")
-            difficulty = it.get("difficulty", "medium")
-            if isinstance(instruction, str) and isinstance(output, str):
-                norm.append({
-                    "instruction": instruction,
-                    "input": str(input_text),
-                    "output": output,
-                    "difficulty": str(difficulty)
-                })
-        elif task == "Constitutional_AI":
-            problematic = it.get("problematic_prompt")
-            constitutional = it.get("constitutional_response")
-            principle = it.get("principle", "")
-            if isinstance(problematic, str) and isinstance(constitutional, str):
-                norm.append({
-                    "problematic_prompt": problematic,
-                    "constitutional_response": constitutional,
-                    "principle": str(principle)
-                })
-        elif task == "Chain_of_Thought":
-            problem = it.get("problem")
-            steps = it.get("thinking_steps")
-            answer = it.get("final_answer")
-            if isinstance(problem, str) and isinstance(steps, list) and isinstance(answer, str):
-                norm.append({
-                    "problem": problem,
-                    "thinking_steps": steps,
-                    "final_answer": answer
-                })
-        elif task == "Dialogue":
-            dialogue = it.get("dialogue")
-            context = it.get("context", "")
-            if isinstance(dialogue, list):
-                norm.append({
-                    "dialogue": dialogue,
-                    "context": str(context)
-                })
-        elif task == "Thai_Culture":
-            question_th = it.get("question_th")
-            answer_th = it.get("answer_th")
-            cultural_context = it.get("cultural_context", "")
-            if isinstance(question_th, str) and isinstance(answer_th, str):
-                norm.append({
-                    "question_th": question_th,
-                    "answer_th": answer_th,
-                    "cultural_context": str(cultural_context)
-                })
-    return norm
-def generate_dataset(
-    user_profile: Any | None,
-    files: List[gr.File],
-    task: str,
-    preset_model: str,
-    custom_model_id: str,
-    hf_token: str,
-    chunk_size: int,
-    overlap: int,
-    max_chunks: int,
-    max_new_tokens: int,
-    temperature: float,
-    custom_instruction: str,
-    min_pairs: int,
-    max_pairs: int,
-    class_labels_text: str,
-    multi_label: bool,
-    target_language: str,
-    num_options: int,
-    ner_labels_text: str,
-):
-    # Enforce login if required
-    if REQUIRE_LOGIN and not user_profile:
-        return "กรุณาเข้าสู่ระบบก่อนเพื่อสร้างชุดข้อมูล", None, None
-    # Read and chunk
-    full_text, _docs = read_pdfs(files)
-    chunks = chunk_text(full_text, chunk_size=chunk_size, overlap=overlap, max_chunks=max_chunks)
     if not chunks:
-        return "ไม่สามารถแยกข้อความจากไฟล์ PDF ได้", None, None
-    model_id = (custom_model_id or "").strip() or preset_model
-    # Prepare template per task
-    base_template = get_task_template(task, custom_instruction)
-    # enrich template with conditional clauses
-    ner_clause = ""
-    if ner_labels_text.strip():
-        ner_clause = f" (limit to: {ner_labels_text.strip()})"
-    base_template = base_template.replace("{ner_labels_clause}", ner_clause)
-    if "{labels}" in base_template:
-        labels_text = class_labels_text.strip() or "[]"
-        base_template = base_template.replace("{labels}", labels_text)
-    if "{multi_label_clause}" in base_template:
-        base_template = base_template.replace("{multi_label_clause}", " Allow multiple labels." if multi_label else " Choose a single best label.")
-    if "{num_options}" in base_template:
-        base_template = base_template.replace("{num_options}", str(int(num_options)))
-    try:
-        chain = build_langchain(model_id, hf_token or None, max_new_tokens, temperature, base_template)
-    except Exception as e:
-        return f"ข้อผิดพลาดในการเตรียม LangChain: {e}", None, None
-    results: List[Dict[str, Any]] = []
     for ch in chunks:
-        try:
-            variables = {"content": ch["text"], "min_pairs": min_pairs, "max_pairs": max_pairs}
-            if "{target_language}" in base_template:
-                variables["target_language"] = target_language or "English"
-            data = chain.invoke(variables)
-            items = normalize_items(task, data)
-        except Exception:
-            # If parser fails, try best-effort extraction on raw string
             try:
-                raw = (PromptTemplate.from_template(base_template) | HuggingFaceEndpoint(model=model_id, token=hf_token, task="text-generation")).invoke(variables)  # type: ignore
-                items = normalize_items(task, raw)
             except Exception:
-                items = []
-        for it in items:
-            # Enrich with context and task
-            it["context"] = (ch["text"][:500] + ("..." if len(ch["text"]) > 500 else ""))
-            it["task"] = task
-            results.append(it)
     if not results:
-        return f"โมเดลไม่ได้ส่งคืนข้อมูลที่ถูกต้องสำหรับงาน {task} ลองปรับ prompt หรือโมเดล", None, None
-    # Deduplicate per task key
-    unique: List[Dict[str, Any]] = []
-    seen = set()
-    def key_of(item: Dict[str, Any]) -> str:
-        if task == "QA":
-            return (item.get("question") or "").strip().lower()
-        if task == "Summarization":
-            return (item.get("summary") or "").strip().lower()
-        if task == "Keywords":
-            return (item.get("keyword") or "").strip().lower()
-        if task == "NER":
-            return f"{item.get('text')}|{item.get('label')}|{item.get('start')}|{item.get('end')}"
-        if task == "Classification":
-            return ",".join(sorted([str(x).lower() for x in item.get("labels", [])]))
-        if task == "MCQ":
-            return (item.get("question") or "").strip().lower()
-        if task == "True/False":
-            return (item.get("statement") or "").strip().lower()
-        if task == "Translation":
-            return f"{item.get('source')}|{item.get('target')}"
-        if task == "RLHF":
-            return (item.get("prompt") or "").strip().lower()
-        if task == "DPO":
-            return (item.get("prompt") or "").strip().lower()
-        if task == "Instruction_Following":
-            return (item.get("instruction") or "").strip().lower()
-        if task == "Constitutional_AI":
-            return (item.get("problematic_prompt") or "").strip().lower()
-        if task == "Chain_of_Thought":
-            return (item.get("problem") or "").strip().lower()
-        if task == "Dialogue":
-            dialogue = item.get("dialogue", [])
-            if dialogue and isinstance(dialogue, list):
-                return str(dialogue[0].get("content", "")).strip().lower()
-            return ""
-        if task == "Thai_Culture":
-            return (item.get("question_th") or "").strip().lower()
-        return json.dumps(item, ensure_ascii=False)
-    for r in results:
-        k = key_of(r)
-        if k and k not in seen:
-            unique.append(r)
-            seen.add(k)
-    # Save to outputs
     outdir = ensure_output_dir()
     ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
-    safe_task = task.lower().replace("/", "-").replace(" ", "_")
-    json_path = os.path.join(outdir, f"dataset_{safe_task}_{ts}.json")
-    jsonl_path = os.path.join(outdir, f"dataset_{safe_task}_{ts}.jsonl")
     with io.open(json_path, "w", encoding="utf-8") as f:
-        json.dump(unique, f, ensure_ascii=False, indent=2)
     with io.open(jsonl_path, "w", encoding="utf-8") as f:
-        for item in unique:
             f.write(json.dumps(item, ensure_ascii=False) + "\n")
-    return f"สร้างข้อมูลสำเร็จ {len(unique)} รายการสำหรับงาน: {task} 🎉", json_path, jsonl_path
 PRESET_MODELS = [
-    # Thai-capable models
-    "openthaigpt/openthaigpt-1.0.0-alpha-7b-chat",
-    "scb10x/llama-3-typhoon-v1.5-8b-instruct",
-    "airesearch/wangchanberta-base-att-spm-uncased",
-    # Multilingual models good for Thai
-    "google/mt5-large",
-    "microsoft/mdeberta-v3-base",
-    "facebook/xglm-7.5B",
-    "microsoft/DialoGPT-medium",
-    # General powerful models
-    "HuggingFaceH4/zephyr-7b-beta",
     "mistralai/Mistral-7B-Instruct-v0.2",
-    "google/flan-t5-large",
     "meta-llama/Llama-2-7b-chat-hf",
-    "microsoft/DialoGPT-large",
 ]
-with gr.Blocks(title="AutoGDataset Thai - PDF to Dataset Generator") as demo:
-    gr.Markdown("""
-    # AutoGDataset Thai 🇹🇭
-    สร้างชุดข้อมูล (Dataset) ภาษาไทยจากไฟล์ PDF โดยใช้ LangChain กับโมเดล Hugging Face
-    **คุณสมบัติ:**
-    - รองรับงานหลากหลายประเภท: QA, RLHF, DPO, Constitutional AI และอื่นๆ
-    - เน้นการสร้างข้อมูลภาษาไทยคุณภาพสูง
-    - รองรับโมเดลภาษาไทยและ multilingual models
-    - สามารถปรับแต่ง prompt เพื่อเพิ่มประสิทธิภาพ
-    เลือกโมเดลที่มีอยู่หรือระบุ repo id ที่กำหนดเอง ระบุ `HF_TOKEN` หากจำเป็นสำหรับโมเดล
-    """)
-    # Login requirement (Hugging Face OAuth via Gradio LoginButton when available)
-    user_state = gr.State(value=None)
-    effective_require_login = bool(REQUIRE_LOGIN and OAUTH_AVAILABLE)
     with gr.Row():
-        login_info = gr.Markdown(
-            value=(
-                "กรุณาเข้าสู่ระบบด้วยบัญชี Hugging Face เพื่อใช้งานแอป"
-                if effective_require_login
-                else (
-                    "การเข้าสู่ระบบเป็นทางเลือก" if OAUTH_AVAILABLE else "ไม่ได้ตั้งค่าการเข้าสู่ระบบ OAuth ในการติดตั้งนี้"
-                )
-            ),
-            elem_id="login-info",
-        )
-    if OAUTH_AVAILABLE:
-        with gr.Row():
-            login_btn = gr.LoginButton(value="เข้าสู่ระบบด้วย Hugging Face")
     with gr.Row():
-        pdf_files = gr.File(label="อัปโหลดไฟล์ PDF", file_count="multiple", file_types=[".pdf"])
-    with gr.Group():
-        with gr.Row():
-            task = gr.Dropdown(
-                label="งานที่ต้องการ (Task Type)",
-                choices=[
-                    "QA",
-                    "Summarization",
-                    "Keywords",
-                    "NER",
-                    "Classification",
-                    "MCQ",
-                    "True/False",
-                    "Translation",
-                    "RLHF",
-                    "DPO",
-                    "Instruction_Following",
-                    "Constitutional_AI",
-                    "Chain_of_Thought",
-                    "Dialogue",
-                    "Thai_Culture",
-                ],
-                value="Thai_Culture",
-            )
-        with gr.Row():
-            preset_model = gr.Dropdown(label="โมเดลที่กำหนดไว้ (Preset Model)", choices=PRESET_MODELS, value=PRESET_MODELS[0])
-            custom_model_id = gr.Textbox(label="รหัสโมเดลกำหนดเอง (ไม่บังคับ)", placeholder="org/model-name")
-        with gr.Row():
-            hf_token = gr.Textbox(label="HF Token", type="password", value=os.environ.get("HF_TOKEN", ""), placeholder="hf_xxx (จำเป็นสำหรับหลายโมเดล)")
-        with gr.Row():
-            max_new_tokens = gr.Slider(64, 1024, value=512, step=16, label="จำนวน Token สูงสุด")
-            temperature = gr.Slider(0.0, 1.5, value=0.2, step=0.05, label="อุณหภูมิ (ความสร้างสรรค์)")
-    with gr.Accordion("การตั้งค่าขั้นสูง (Advanced Settings)", open=False):
-        with gr.Row():
-            chunk_size = gr.Slider(500, 4000, value=1500, step=50, label="ขนาดส่วนข้อความ (ตัวอักษร)")
-            overlap = gr.Slider(0, 1000, value=200, step=50, label="การทับซ้อน (ตัวอักษร)")
-            max_chunks = gr.Slider(1, 40, value=5, step=1, label="จำนวนส่วนสูงสุด")
-        with gr.Row():
-            min_pairs = gr.Slider(1, 10, value=3, step=1, label="คู่ข้อมูลต่ำสุด/ส่วน")
-            max_pairs = gr.Slider(1, 12, value=6, step=1, label="คู่ข้อมูลสูงสุด/ส่วน")
-        custom_instruction = gr.Textbox(
-            label="คำสั่งกำหนดเอง (ไม่บังคับ)",
-            lines=3,
-            placeholder="แทนที่คำสั่งเริ่มต้น ต้องส่งคืน JSON array บริสุทธิ์ตามโครงสร้างงาน",
-            value="สร้างข้อมูลภาษาไทยคุณภาพสูงที่เข้าใจบริบททางวัฒนธรรมไทย ใช้ภาษาไทยที่เป็นธรรมชาติและเหมาะสมกับเนื้อหา"
-        )
-        # Task-specific controls
-        classification_labels = gr.Textbox(label="ป้ายกำกับการจำแนก (คั่นด้วยคอมมา)", visible=False)
-        multi_label = gr.Checkbox(label="อนุญาตหลายป้ายกำกับ", value=False, visible=False)
-        target_language = gr.Textbox(label="ภาษาเป้าหมาย (การแปล)", value="ไทย", visible=False)
-        num_options = gr.Slider(3, 6, value=4, step=1, label="ตัวเลือก MCQ", visible=False)
-        ner_labels = gr.Textbox(label="ป้ายกำกับ NER (คั่นด้วยคอมมา, ไม่บังคับ)", visible=False)
-    generate_btn = gr.Button("สร้างชุดข้อมูล (Generate Dataset)", variant="primary", interactive=(not effective_require_login))
     with gr.Row():
-        status = gr.Markdown()
     with gr.Row():
-        out_json = gr.File(label="ดาวน์โหลด JSON")
-        out_jsonl = gr.File(label="ดาวน์โหลด JSONL")
-    # Toggle visibility for task-specific controls
-    def _switch_task(t: str):
-        is_cls = t == "Classification"
-        is_tr = t == "Translation"
-        is_mcq = t == "MCQ"
-        is_ner = t == "NER"
-        return (
-            gr.update(visible=is_cls),  # classification_labels
-            gr.update(visible=is_cls),  # multi_label
-            gr.update(visible=is_tr),   # target_language
-            gr.update(visible=is_mcq),  # num_options
-            gr.update(visible=is_ner),  # ner_labels
-        )
-    task.change(_switch_task, inputs=task, outputs=[classification_labels, multi_label, target_language, num_options, ner_labels])
     generate_btn.click(
         fn=generate_dataset,
-        inputs=[
-            user_state,
-            pdf_files,
-            task,
-            preset_model,
-            custom_model_id,
-            hf_token,
-            chunk_size,
-            overlap,
-            max_chunks,
-            max_new_tokens,
-            temperature,
-            custom_instruction,
-            min_pairs,
-            max_pairs,
-            classification_labels,
-            multi_label,
-            target_language,
-            num_options,
-            ner_labels,
-        ],
-        outputs=[status, out_json, out_jsonl],
-        show_progress=True,
-        api_name="generate",
     )
-    if OAUTH_AVAILABLE:
-        def _on_login(user):
-            try:
-                username = None
-                if isinstance(user, dict):
-                    username = user.get("username") or user.get("name")
-                if not username and hasattr(user, "username"):
-                    username = getattr(user, "username")
-                msg = f"เข้าสู่ระบบแล้วในนาม @{username}" if username else "เข้าสู่ระบบแล้ว"
-            except Exception:
-                msg = "เข้าสู่ระบบแล้ว"
-            return user, gr.update(value=msg), gr.update(interactive=True)
-        # Enable Generate button after login and store user profile
-        if hasattr(login_btn, "login"):
-            login_btn.login(_on_login, inputs=None, outputs=[user_state, login_info, generate_btn])
-        else:
-            # In local/dev without OAuth routing, clicking will mock-login
-            login_btn.click(lambda: ("local_user", gr.update(value="เข้าสู่ระบบแล้ว (ภายในเครื่อง)"), gr.update(interactive=True)), inputs=None, outputs=[user_state, login_info, generate_btn])
 if __name__ == "__main__":
-    # For local runs
-    demo.queue().launch()

 from typing import List, Dict, Any, Tuple
 import gradio as gr
+from pypdf import PdfReader
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+# Default model, used when neither a preset nor a custom model ID is selected.
+DEFAULT_MODEL = "HuggingFaceH4/zephyr-7b-beta"
+# Global pipeline state: these three names are filled in by load_model() and
+# act as a one-entry cache of the most recently loaded model.
+gen_pipe = None
+tokenizer = None
+current_model_id = None
+def load_model(model_id: str, hf_token: str = None):
+    """Return a text-generation pipeline for ``model_id``, reusing the cached one.
+
+    The most recently loaded pipeline is kept in the module-level
+    ``gen_pipe`` / ``tokenizer`` / ``current_model_id`` globals, so repeated
+    calls with the same model ID do not reload the weights.
+
+    Args:
+        model_id: Hugging Face Hub model ID to load.
+        hf_token: Optional access token forwarded to ``from_pretrained``;
+            an empty string is treated as "no token".
+
+    Returns:
+        The ``transformers`` text-generation pipeline for ``model_id``.
+
+    Raises:
+        Whatever ``from_pretrained`` / ``pipeline`` raise when the model
+        cannot be downloaded or instantiated. In that case the cache is left
+        empty rather than half-updated.
+    """
+    global gen_pipe, tokenizer, current_model_id
+    import gc
+
+    if current_model_id == model_id and gen_pipe is not None:
+        return gen_pipe
+    print(f"🔄 Loading model: {model_id}")
+    # Drop the references to the previous model *before* loading the next one;
+    # otherwise both sets of weights are alive at the same time.
+    gen_pipe = None
+    tokenizer = None
+    current_model_id = None
+    gc.collect()
+    token = hf_token or None
+    # Build into locals and publish only after every step succeeded, so a
+    # failed load never leaves the globals pointing at mismatched objects.
+    new_tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
+    model = AutoModelForCausalLM.from_pretrained(model_id, token=token, device_map="auto")
+    new_pipe = pipeline("text-generation", model=model, tokenizer=new_tokenizer, device_map="auto")
+    tokenizer = new_tokenizer
+    gen_pipe = new_pipe
+    current_model_id = model_id
+    return gen_pipe
 def ensure_output_dir() -> str:
 def read_pdfs(files: List[gr.File]) -> Tuple[str, List[Dict[str, Any]]]:
     docs = []
     combined_text_parts: List[str] = []
     for f in files:
         reader = PdfReader(path)
         pages_text = []
         for i, page in enumerate(reader.pages):
+            text = page.extract_text() or ""
             text = re.sub(r"\s+", " ", text).strip()
             if text:
                 pages_text.append({"page": i + 1, "text": text})
     return combined_text, docs
+def chunk_text(text: str, chunk_size: int = 1500, overlap: int = 200, max_chunks: int = 5) -> List[str]:
+    """Split ``text`` into at most ``max_chunks`` overlapping character windows.
+
+    Args:
+        text: Text to split; leading/trailing whitespace is stripped first.
+        chunk_size: Maximum number of characters per chunk.
+        overlap: Number of characters shared by consecutive chunks. It is
+            clamped to ``0 .. chunk_size - 1`` so every window moves forward.
+        max_chunks: Upper bound on the number of chunks returned.
+
+    Returns:
+        The chunks in document order; an empty list when ``text`` is blank or
+        ``chunk_size`` is not positive.
+    """
     text = text.strip()
     if not text:
         return []
+    # Callers (e.g. UI sliders) may hand over floats; slicing needs ints.
+    chunk_size, overlap, max_chunks = int(chunk_size), int(overlap), int(max_chunks)
+    if chunk_size <= 0:
+        return []
+    # With overlap >= chunk_size the window would never advance and the same
+    # chunk would be emitted max_chunks times.
+    overlap = min(max(overlap, 0), chunk_size - 1)
+    chunks: List[str] = []
     start = 0
     n = len(text)
     while start < n and len(chunks) < max_chunks:
         end = min(start + chunk_size, n)
         chunk = text[start:end]
+        chunks.append(chunk)
         if end >= n:
             break
+        # end == start + chunk_size here, so this advances by >= 1 character.
+        start = end - overlap
     return chunks
+# Base prompt template (Thai). Asks the model for {min_pairs} to {max_pairs}
+# question-answer pairs about {content}, returned strictly as a JSON array of
+# {"question": str, "answer": str} objects. The doubled braces are literal
+# braces escaped for str.format().
+DEFAULT_QA_PROMPT = (
+    "คุณเป็นผู้ช่วยสร้างชุดข้อมูล อ่านเนื้อหานี้แล้วสร้างคำถาม-คำตอบ "
+    "จำนวน {min_pairs} ถึง {max_pairs} คู่ "
+    "ส่งคืน JSON array ที่มี objects รูปแบบ {{\"question\": str, \"answer\": str}} เท่านั้น\n\n"
+    "เนื้อหา:\n{content}\n"
 )
+def _extract_json_items(output: str) -> List[Dict[str, Any]]:
+    """Return the object items of the outermost JSON array found in ``output``.
+
+    Returns an empty list when there is no ``[...]`` span, the span is not
+    valid JSON, or it does not decode to a list.
+    """
+    start, end = output.find("["), output.rfind("]")
+    if start == -1 or end <= start:
+        return []
+    try:
+        data = json.loads(output[start:end + 1])
+    except ValueError:  # json.JSONDecodeError is a ValueError subclass
+        return []
+    if not isinstance(data, list):
+        return []
+    # Drop stray scalars/strings: every dataset row must be a JSON object.
+    return [item for item in data if isinstance(item, dict)]
+
+
+def generate_dataset(files: List[gr.File],
+                     task: str,
+                     preset_model: str,
+                     custom_model_id: str,
+                     hf_token: str,
+                     chunk_size: int,
+                     overlap: int,
+                     max_chunks: int,
+                     max_new_tokens: int,
+                     temperature: float,
+                     min_pairs: int,
+                     max_pairs: int):
+    """Generate a question-answer dataset from uploaded PDFs with a local model.
+
+    Args:
+        files: Uploaded PDF files.
+        task: Free-text task label; only used (sanitised) in output file names.
+        preset_model: Model ID chosen from the preset list.
+        custom_model_id: Optional model ID that overrides ``preset_model``.
+        hf_token: Optional Hugging Face access token.
+        chunk_size: Maximum characters per text chunk.
+        overlap: Characters shared between consecutive chunks.
+        max_chunks: Maximum number of chunks sent to the model.
+        max_new_tokens: Generation length limit per chunk.
+        temperature: Sampling temperature; ``0`` (or ``None``) means greedy decoding.
+        min_pairs: Minimum number of QA pairs requested per chunk.
+        max_pairs: Maximum number of QA pairs requested per chunk.
+
+    Returns:
+        A ``(status_message, json_path, jsonl_path)`` tuple. Both paths are
+        ``None`` when nothing could be generated.
+    """
+    from datetime import timezone
+
+    if not files:
+        return "❌ กรุณาอัปโหลดไฟล์ PDF", None, None
+    # Read and chunk the PDFs first: it is cheap, and it avoids loading a large
+    # model when the upload contains no extractable text.
+    full_text, _ = read_pdfs(files)
+    chunks = chunk_text(full_text, int(chunk_size), int(overlap), int(max_chunks))
     if not chunks:
+        return "❌ ไม่สามารถดึงข้อความจาก PDF", None, None
+    # Load the model (custom ID wins over the preset, then the default).
+    model_id = (custom_model_id or "").strip() or preset_model or DEFAULT_MODEL
+    pipe = load_model(model_id, hf_token or None)
+    # Tolerate a reversed range coming from two independent sliders.
+    low_pairs, high_pairs = sorted((int(min_pairs), int(max_pairs)))
+    # return_full_text=False: parse only the completion. By default the
+    # pipeline echoes the prompt, and any "[" inside the PDF text would then be
+    # mistaken for the start of the JSON array.
+    gen_kwargs: Dict[str, Any] = {
+        "max_new_tokens": int(max_new_tokens),
+        "return_full_text": False,
+    }
+    if temperature is not None and temperature > 0.0:
+        gen_kwargs["do_sample"] = True
+        gen_kwargs["temperature"] = float(temperature)
+    else:
+        # Greedy decoding: do not pass a temperature that would be ignored.
+        gen_kwargs["do_sample"] = False
+    results = []
+    for idx, ch in enumerate(chunks, start=1):
+        prompt = DEFAULT_QA_PROMPT.format(
+            min_pairs=low_pairs,
+            max_pairs=high_pairs,
+            content=ch
+        )
+        output = pipe(prompt, **gen_kwargs)[0]["generated_text"]
+        # Try to extract the JSON array from the model output.
+        items = _extract_json_items(output)
+        if not items:
+            # Best effort per chunk: report and move on instead of failing.
+            print(f"No usable JSON array in model output for chunk {idx}/{len(chunks)}")
+        results.extend(items)
     if not results:
+        return "❌ ไม่สามารถสร้างข้อมูล JSON ได้", None, None
+    # Save output
     outdir = ensure_output_dir()
+    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
+    # `task` is user input: keep only word characters and dashes so it cannot
+    # inject path separators or "..".
+    safe_task = re.sub(r"[^\w\-]+", "_", (task or "").strip()) or "untitled"
+    json_path = os.path.join(outdir, f"dataset_{safe_task}_{ts}.json")
+    jsonl_path = os.path.join(outdir, f"dataset_{safe_task}_{ts}.jsonl")
     with io.open(json_path, "w", encoding="utf-8") as f:
+        json.dump(results, f, ensure_ascii=False, indent=2)
     with io.open(jsonl_path, "w", encoding="utf-8") as f:
+        for item in results:
             f.write(json.dumps(item, ensure_ascii=False) + "\n")
+    return f"✅ สร้างข้อมูลสำเร็จ {len(results)} รายการ", json_path, jsonl_path
+# ---------------- Gradio UI ----------------
 PRESET_MODELS = [
+    DEFAULT_MODEL,
     "mistralai/Mistral-7B-Instruct-v0.2",
     "meta-llama/Llama-2-7b-chat-hf",
+    "google/flan-t5-large"
 ]
+# Build the Gradio interface: input widgets, a generate button, and output
+# slots. Components appear in the order they are created below.
+with gr.Blocks(title="Thai PDF → Dataset Generator") as demo:
+    gr.Markdown("# 📚 Thai Auto Dataset Generator")
+    # PDF upload: multiple files allowed, restricted to the .pdf extension.
     with gr.Row():
+        pdf_files = gr.File(label="อัปโหลด PDF", file_count="multiple", file_types=[".pdf"])
+    # Task label (it ends up in the output file names), model selection, and
+    # the Hugging Face access token.
     with gr.Row():
+        task = gr.Textbox(label="Task", value="QA")
+        preset_model = gr.Dropdown(label="Preset Model", choices=PRESET_MODELS, value=DEFAULT_MODEL)
+        custom_model_id = gr.Textbox(label="Custom Model ID", placeholder="org/model-name")
+        hf_token = gr.Textbox(label="HF Token", type="password")
+    # Generation settings forwarded to the model pipeline.
+    with gr.Row():
+        max_new_tokens = gr.Slider(64, 1024, value=512, step=16, label="Max New Tokens")
+        temperature = gr.Slider(0.0, 1.5, value=0.3, step=0.05, label="Temperature")
+    # Text chunking settings (sizes are in characters, see chunk_text).
     with gr.Row():
+        chunk_size = gr.Slider(500, 4000, value=1500, step=50, label="Chunk Size")
+        overlap = gr.Slider(0, 1000, value=200, step=50, label="Overlap")
+        max_chunks = gr.Slider(1, 20, value=5, step=1, label="Max Chunks")
+    # Range of question-answer pairs requested from the model for each chunk.
     with gr.Row():
+        min_pairs = gr.Slider(1, 10, value=3, step=1, label="Min Pairs")
+        max_pairs = gr.Slider(1, 12, value=6, step=1, label="Max Pairs")
+    generate_btn = gr.Button("🚀 Generate Dataset")
+    # Status message plus the two result files returned by generate_dataset.
+    status = gr.Markdown()
+    out_json = gr.File(label="JSON")
+    out_jsonl = gr.File(label="JSONL")
+    # The order of `inputs` must match generate_dataset's positional
+    # parameters, and `outputs` must match its returned 3-tuple.
     generate_btn.click(
         fn=generate_dataset,
+        inputs=[pdf_files, task, preset_model, custom_model_id, hf_token,
+                chunk_size, overlap, max_chunks, max_new_tokens, temperature,
+                min_pairs, max_pairs],
+        outputs=[status, out_json, out_jsonl]
     )
+# Start the app only when this file is executed directly, not when imported.
 if __name__ == "__main__":
+    demo.queue().launch()