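# Thai PDF → Dataset Generator (Gradio app): extract text from uploaded
# PDFs, split it into overlapping chunks, and prompt a Hugging Face causal
# LM to produce Thai question-answer pairs, saved as JSON and JSONL.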
import os
import re
import json
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple
import gradio as gr
from pypdf import PdfReader
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# Default model (used when no preset or custom ID is given)
DEFAULT_MODEL = "HuggingFaceH4/zephyr-7b-beta"
# Global pipeline state, lazily (re)loaded by load_model()
gen_pipe = None
tokenizer = None
current_model_id = None
def load_model(model_id: str, hf_token: Optional[str] = None):
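    """Load (and cache) a text-generation pipeline for model_id.

    The pipeline is reused while the same model is requested; choosing a
    different model triggers a fresh load.
    """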
global gen_pipe, tokenizer, current_model_id
if current_model_id == model_id and gen_pipe is not None:
return gen_pipe
print(f"🔄 Loading model: {model_id}")
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_token, device_map="auto")
    # The model is already placed on devices by device_map="auto", so the
    # pipeline is built around the instantiated model without a device argument.
    gen_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
current_model_id = model_id
return gen_pipe
def ensure_output_dir() -> str:
outdir = os.path.join(os.getcwd(), "outputs")
os.makedirs(outdir, exist_ok=True)
return outdir
def read_pdfs(files: List[gr.File]) -> Tuple[str, List[Dict[str, Any]]]:
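    """Extract and whitespace-normalize text from each uploaded PDF.

    Returns (combined text across all files, per-file structure of the form
    [{"file": name, "pages": [{"page": n, "text": ...}, ...]}, ...]).
    """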
docs = []
combined_text_parts: List[str] = []
for f in files:
path = f.name if hasattr(f, "name") else f
reader = PdfReader(path)
pages_text = []
for i, page in enumerate(reader.pages):
text = page.extract_text() or ""
text = re.sub(r"\s+", " ", text).strip()
if text:
pages_text.append({"page": i + 1, "text": text})
combined_text_parts.append(text)
docs.append({"file": os.path.basename(path), "pages": pages_text})
combined_text = "\n\n".join(combined_text_parts)
return combined_text, docs
def chunk_text(text: str, chunk_size: int = 1500, overlap: int = 200, max_chunks: int = 5) -> List[str]:
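    """Split text into overlapping windows of at most chunk_size characters.

    Worked example: with chunk_size=1500 and overlap=200, windows start at
    0, 1300, 2600, ... (each window re-reads the last 200 characters of the
    previous one), and at most max_chunks windows are produced.
    """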
text = text.strip()
if not text:
return []
chunks: List[str] = []
start = 0
n = len(text)
while start < n and len(chunks) < max_chunks:
end = min(start + chunk_size, n)
chunk = text[start:end]
chunks.append(chunk)
if end >= n:
break
        # Advance by at least one character so an overlap >= chunk_size
        # (possible with the UI sliders) cannot stall the window.
        start = max(end - overlap, start + 1)
return chunks
# Base prompt template (kept in Thai so the generated QA pairs stay in Thai)
DEFAULT_QA_PROMPT = (
"คุณเป็นผู้ช่วยสร้างชุดข้อมูล อ่านเนื้อหานี้แล้วสร้างคำถาม-คำตอบ "
"จำนวน {min_pairs} ถึง {max_pairs} คู่ "
"ส่งคืน JSON array ที่มี objects รูปแบบ {{\"question\": str, \"answer\": str}} เท่านั้น\n\n"
"เนื้อหา:\n{content}\n"
)
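# English gloss of the template: "You are a dataset-building assistant. Read
# this content and create {min_pairs} to {max_pairs} question-answer pairs.
# Return only a JSON array of objects shaped {"question": str, "answer": str}.
# Content: {content}"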
def generate_dataset(files: List[gr.File],
task: str,
preset_model: str,
custom_model_id: str,
hf_token: str,
chunk_size: int,
overlap: int,
max_chunks: int,
max_new_tokens: int,
temperature: float,
min_pairs: int,
max_pairs: int):
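    """Run the PDF-to-QA-pairs pipeline and write JSON/JSONL files.

    Each JSONL line holds one generated pair, e.g.
    {"question": "...", "answer": "..."}. Returns (status message, JSON path,
    JSONL path); the paths are None when generation fails.
    """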
if not files:
return "❌ กรุณาอัปโหลดไฟล์ PDF", None, None
    # Resolve the model ID (a custom ID overrides the preset) and load it
model_id = (custom_model_id or "").strip() or preset_model or DEFAULT_MODEL
    try:
        pipe = load_model(model_id, hf_token or None)
    except Exception as e:
        return f"❌ Failed to load model {model_id}: {e}", None, None
    # Read the PDFs and split the extracted text into chunks
full_text, _ = read_pdfs(files)
chunks = chunk_text(full_text, chunk_size, overlap, max_chunks)
if not chunks:
return "❌ ไม่สามารถดึงข้อความจาก PDF", None, None
results = []
for ch in chunks:
prompt = DEFAULT_QA_PROMPT.format(
min_pairs=min_pairs,
max_pairs=max_pairs,
content=ch
)
        # Sample only when temperature > 0; return_full_text=False keeps the
        # echoed prompt (whose content may itself contain "[") out of the
        # JSON scan below.
        gen_kwargs: Dict[str, Any] = {"max_new_tokens": max_new_tokens,
                                      "do_sample": temperature > 0.0,
                                      "return_full_text": False}
        if temperature > 0.0:
            gen_kwargs["temperature"] = temperature
        output = pipe(prompt, **gen_kwargs)[0]["generated_text"]
        # Try to extract the JSON array from the model output
start, end = output.find("["), output.rfind("]")
if start != -1 and end != -1:
try:
data = json.loads(output[start:end + 1])
                if isinstance(data, list):
                    # Keep only well-formed objects
                    results.extend(d for d in data if isinstance(d, dict))
except Exception:
pass
if not results:
return "❌ ไม่สามารถสร้างข้อมูล JSON ได้", None, None
# Save output
outdir = ensure_output_dir()
    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    # Sanitize the free-text task name before using it in file names
    safe_task = re.sub(r"[^\w.-]+", "_", (task or "").strip()) or "task"
    json_path = os.path.join(outdir, f"dataset_{safe_task}_{ts}.json")
    jsonl_path = os.path.join(outdir, f"dataset_{safe_task}_{ts}.jsonl")
    with open(json_path, "w", encoding="utf-8") as f:
json.dump(results, f, ensure_ascii=False, indent=2)
    with open(jsonl_path, "w", encoding="utf-8") as f:
for item in results:
f.write(json.dumps(item, ensure_ascii=False) + "\n")
return f"✅ สร้างข้อมูลสำเร็จ {len(results)} รายการ", json_path, jsonl_path
# ---------------- Gradio UI ----------------
# Presets must be causal LMs: load_model uses AutoModelForCausalLM, so a
# seq2seq model such as google/flan-t5-large would fail to load.
PRESET_MODELS = [
    DEFAULT_MODEL,
    "mistralai/Mistral-7B-Instruct-v0.2",
    "meta-llama/Llama-2-7b-chat-hf",
]
with gr.Blocks(title="Thai PDF → Dataset Generator") as demo:
gr.Markdown("# 📚 Thai Auto Dataset Generator")
with gr.Row():
        pdf_files = gr.File(label="Upload PDFs", file_count="multiple", file_types=[".pdf"])
with gr.Row():
task = gr.Textbox(label="Task", value="QA")
preset_model = gr.Dropdown(label="Preset Model", choices=PRESET_MODELS, value=DEFAULT_MODEL)
custom_model_id = gr.Textbox(label="Custom Model ID", placeholder="org/model-name")
hf_token = gr.Textbox(label="HF Token", type="password")
with gr.Row():
max_new_tokens = gr.Slider(64, 1024, value=512, step=16, label="Max New Tokens")
temperature = gr.Slider(0.0, 1.5, value=0.3, step=0.05, label="Temperature")
with gr.Row():
chunk_size = gr.Slider(500, 4000, value=1500, step=50, label="Chunk Size")
overlap = gr.Slider(0, 1000, value=200, step=50, label="Overlap")
max_chunks = gr.Slider(1, 20, value=5, step=1, label="Max Chunks")
with gr.Row():
min_pairs = gr.Slider(1, 10, value=3, step=1, label="Min Pairs")
max_pairs = gr.Slider(1, 12, value=6, step=1, label="Max Pairs")
generate_btn = gr.Button("🚀 Generate Dataset")
status = gr.Markdown()
out_json = gr.File(label="JSON")
out_jsonl = gr.File(label="JSONL")
generate_btn.click(
fn=generate_dataset,
inputs=[pdf_files, task, preset_model, custom_model_id, hf_token,
chunk_size, overlap, max_chunks, max_new_tokens, temperature,
min_pairs, max_pairs],
outputs=[status, out_json, out_jsonl]
)
if __name__ == "__main__":
demo.queue().launch() |
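# To run locally (a sketch; assumes this file is saved as app.py):
#   pip install gradio pypdf transformers accelerate torch
#   python app.py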