Spaces:

chen666-666
/

wechat-ner-re

Sleeping

App Files Files Community

chen666-666 committed on Apr 16

Commit

b45e38c

1 Parent(s): 4cd7a63

add app.py and requirements.txt

Browse files

Files changed (1) hide show

app.py +109 -30

app.py CHANGED Viewed

@@ -132,49 +132,128 @@ def ner(text, model_type="bert"):
     return entities, time.time() - start_time
 # ======================== 关系抽取（RE） ========================
 def re_extract(entities, text):
-    # 修改7：添加实体类型过滤
-    valid_entity_types = {"PER", "LOC", "ORG"}
-    filtered_entities = [e for e in entities if e["type"] in valid_entity_types]
-    if len(filtered_entities) < 2:
         return []
     relations = []
-    try:
-        entity_pairs = [(e1, e2) for i, e1 in enumerate(entities) for e2 in entities[i + 1:]]
-        prompt = f"""分析文本中的实体关系，返回JSON列表：
-文本：{text}
-实体列表：{[e['text'] for e in entities]}
 要求：
-1. 仅返回存在明确关系的实体对
-2. 关系类型使用：属于、位于、参与、其他
-3. 格式示例：[{{"head": "北京", "tail": "中国", "relation": "位于"}}]"""
-        if use_chatglm:
-            response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
             if isinstance(response, tuple):
                 response = response[0]
-            # 提取 JSON
             try:
-                json_str = re.search(r'\[.*\]', response, re.DOTALL).group()
-                relations = json.loads(json_str)
-                # 验证关系
-                valid_relations = []
-                valid_types = {"属于", "位于", "参与", "其他"}
-                for rel in relations:
-                    if all(k in rel for k in ("head", "tail", "relation")) and rel["relation"] in valid_types:
-                        valid_relations.append(rel)
-                return valid_relations
             except Exception as e:
-                print(f"关系解析失败: {e}")
-    except Exception as e:
-        print(f"关系抽取失败: {e}")
-    # 默认不生成任何关系
-    return []
 # ======================== 文本分析主流程 ========================

     return entities, time.time() - start_time
# ======================== Relation Extraction (RE) ========================
def _single_entity_relations(ent, text):
    """Derive relations by rule when only one usable entity was recognised.

    Args:
        ent: Entity dict with at least the keys "text" and "type".
        text: The sentence/paragraph the entity was extracted from.

    Returns:
        A list with at most one relation dict per rule
        ({"head", "tail", "relation"}).
    """
    ent_text = ent["text"]
    ent_type = ent["type"]
    found = []

    # Rule 1: a person plus the first job-title keyword present in the text.
    if ent_type == "PER":
        for keyword in ("CEO", "经理", "总监", "工程师", "教授"):
            if keyword in text:
                found.append({
                    "head": ent_text,
                    "tail": keyword,
                    "relation": "担任职位",
                })
                break

    # Rule 2: "<entity><location verb><place>" for organisations/locations.
    # The entity text is escaped so names such as "C++公司" cannot break the
    # pattern, and the place may end at a comma, a full stop or end of text.
    if ent_type in ("ORG", "LOC"):
        pattern = re.escape(ent_text) + r"(?:位于|坐落于|地处)(.*?)(?:[，。]|$)"
        match = re.search(pattern, text)
        if match:
            place = match.group(1).strip()
            if place:  # ignore "X位于。" style matches with nothing captured
                found.append({
                    "head": ent_text,
                    "tail": place,
                    "relation": "位置",
                })
    return found


def _chatglm_relations(entity_list, text):
    """Ask ChatGLM for relations and return only the validated ones.

    Relies on the module-level ``chatglm_model`` / ``chatglm_tokenizer``.
    Errors raised by the model call itself propagate to the caller; a reply
    that cannot be parsed as a JSON list yields an empty list.

    Args:
        entity_list: Texts of the candidate entities.
        text: The text to analyse.

    Returns:
        Relation dicts whose head and tail are both in ``entity_list`` and
        whose relation label is one of the allowed types.
    """
    prompt = f"""请分析以下文本中的实体关系，严格按照JSON列表格式返回：
文本内容：{text}
候选实体：{entity_list}
要求：
1. 只返回存在明确关系的实体对
2. 关系类型使用：属于、位于、任职于、合作、其他
3. 示例格式：[{{"head":"实体1", "tail":"实体2", "relation":"关系类型"}}]
请直接返回JSON，不要多余内容："""
    response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.01)
    if isinstance(response, tuple):
        response = response[0]

    try:
        found = re.search(r'(\[.*?\])', response, re.DOTALL)
        if not found:
            return []
        json_str = found.group(1)
        json_str = re.sub(r'[\u201c\u201d]', '"', json_str)  # curly quotes -> ASCII
        json_str = re.sub(r'(?<!,)\n', '', json_str)  # drop newlines not following a comma
        parsed = json.loads(json_str)
    except (TypeError, ValueError) as e:
        print(f"[DEBUG] 关系解析失败: {str(e)}")
        return []

    # Validate before handing anything back, so malformed items can never
    # reach the caller's post-processing.
    valid_rel_types = {"属于", "位于", "任职于", "合作", "其他"}
    return [
        rel for rel in parsed
        if isinstance(rel, dict)
        and rel.get("head") in entity_list
        and rel.get("tail") in entity_list
        and isinstance(rel.get("relation"), str)
        and rel["relation"] in valid_rel_types
    ]


def _rule_based_relations(text):
    """Extract candidate relations from ``text`` with fixed regex rules.

    Heads are matched lazily so the verb is never partially absorbed into
    the head (e.g. "A隶属于B" yields head "A", not "A隶").
    """
    found = []

    # Rule 1: A 位于/坐落于/地处 B
    for match in re.finditer(r'([^\s，。]+?)(?:位于|坐落于|地处)([^\s，。]+)', text):
        head, tail = match.groups()
        found.append({"head": head, "tail": tail, "relation": "位于"})

    # Rule 2: A 属于/隶属于 B
    for match in re.finditer(r'([^\s，。]+?)(?:隶属于|属于)([^\s，。]+)', text):
        head, tail = match.groups()
        found.append({"head": head, "tail": tail, "relation": "属于"})

    # Rule 3: person -> company/university employment
    person_org_pattern = (
        r'([\u4e00-\u9fa5]{2,4})(现任|担任|就职于)'
        r'([\u4e00-\u9fa5]+?公司|[\u4e00-\u9fa5]+?大学)'
    )
    for match in re.finditer(person_org_pattern, text):
        head, _, tail = match.groups()
        found.append({"head": head, "tail": tail, "relation": "任职于"})
    return found


def re_extract(entities, text):
    """Extract relations between recognised entities in ``text``.

    Only entities of type PER, LOC, ORG or TITLE are considered.

    * No usable entity: returns ``[]``.
    * Exactly one usable entity: keyword/regex rules produce relations whose
      tail is taken from the text (job title or place).
    * Two or more: ChatGLM is queried when the module-level ``use_chatglm``
      flag is truthy; if that yields nothing, regex rules are used instead.
      Results are de-duplicated and kept only when both head and tail are
      texts of usable entities.

    Args:
        entities: List of entity dicts with "text" and "type" keys.
        text: The source text.

    Returns:
        A list of dicts with the keys "head", "tail" and "relation".
    """
    if not entities or not text:
        return []

    # Entity-type filter (adjust to business needs).
    valid_entity_types = {"PER", "LOC", "ORG", "TITLE"}
    filtered_entities = [e for e in entities if e.get("type") in valid_entity_types]
    if not filtered_entities:
        return []
    if len(filtered_entities) == 1:
        return _single_entity_relations(filtered_entities[0], text)

    entity_list = [e["text"] for e in filtered_entities]

    # Option 1: model-based extraction (best effort; any failure falls
    # through to the rule-based fallback below).
    relations = []
    if use_chatglm:
        try:
            relations = _chatglm_relations(entity_list, text)
        except Exception as e:
            print(f"ChatGLM关系抽取异常: {str(e)}")

    # Option 2: rule-based fallback when the model is unavailable or found nothing.
    if not relations:
        relations = _rule_based_relations(text)

    # Post-processing: drop duplicates and relations whose endpoints are not
    # recognised entities.
    known_texts = set(entity_list)
    seen = set()
    final_relations = []
    for rel in relations:
        key = (rel["head"], rel["tail"], rel["relation"])
        if key in seen:
            continue
        if rel["head"] in known_texts and rel["tail"] in known_texts:
            final_relations.append(rel)
            seen.add(key)
    return final_relations
 # ======================== 文本分析主流程 ========================