Spaces:
Sleeping
Sleeping
Commit
·
b45e38c
1
Parent(s):
4cd7a63
add app.py and requirements.txt
Browse files
app.py
CHANGED
|
@@ -132,49 +132,128 @@ def ner(text, model_type="bert"):
|
|
| 132 |
return entities, time.time() - start_time
|
| 133 |
|
| 134 |
|
|
|
|
| 135 |
# ======================== 关系抽取(RE) ========================
|
| 136 |
def re_extract(entities, text):
|
| 137 |
-
#
|
| 138 |
-
|
| 139 |
-
filtered_entities = [e for e in entities if e["type"] in valid_entity_types]
|
| 140 |
-
|
| 141 |
-
if len(filtered_entities) < 2:
|
| 142 |
return []
|
| 143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
relations = []
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
| 150 |
要求:
|
| 151 |
-
1.
|
| 152 |
-
2.
|
| 153 |
-
3.
|
|
|
|
| 154 |
|
| 155 |
-
|
| 156 |
-
response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
|
| 157 |
if isinstance(response, tuple):
|
| 158 |
response = response[0]
|
| 159 |
|
| 160 |
-
#
|
| 161 |
try:
|
| 162 |
-
json_str = re.search(r'\[
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
except Exception as e:
|
| 172 |
-
print(f"关系解析失败: {e}")
|
| 173 |
-
except Exception as e:
|
| 174 |
-
print(f"关系抽取失败: {e}")
|
| 175 |
|
| 176 |
-
|
| 177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
|
| 179 |
|
| 180 |
# ======================== 文本分析主流程 ========================
|
|
|
|
| 132 |
return entities, time.time() - start_time
|
| 133 |
|
| 134 |
|
| 135 |
+
# ======================== 关系抽取(RE) ========================
|
| 136 |
# ======================== 关系抽取(RE) ========================
|
| 137 |
def re_extract(entities, text):
    """Extract relations between the recognised entities of ``text``.

    Strategy:
      * exactly one usable entity -> keyword/regex rules for that entity only;
      * otherwise, ask ChatGLM when the module-level ``use_chatglm`` flag is
        truthy and at least two usable entities exist;
      * if that yields nothing, fall back to regex rules over the raw text.
    Results of the multi-entity path are de-duplicated and restricted to
    relations whose head and tail are both known entity texts.

    Args:
        entities: iterable of dicts; each is read through its ``"text"`` and
            ``"type"`` keys. Only types PER / LOC / ORG / TITLE are used.
        text: the sentence or document the entities were found in.

    Returns:
        A list of ``{"head": str, "tail": str, "relation": str}`` dicts
        (possibly empty).
    """
    if not entities or not text:
        return []

    # Entity-type filter (adjust to business needs). Entities without a text
    # are dropped as well: they can never be the head or tail of a relation.
    valid_entity_types = {"PER", "LOC", "ORG", "TITLE"}
    usable = [
        e for e in entities
        if e.get("type") in valid_entity_types and e.get("text")
    ]

    # ------------------------- single-entity case -------------------------
    if len(usable) == 1:
        return _re_single_entity_relations(usable[0], text)

    # ------------------------ multi-entity extraction ----------------------
    relations = []

    # Option 1: let ChatGLM propose relations.
    if use_chatglm and len(usable) >= 2:
        relations = _re_chatglm_relations(text, [e["text"] for e in usable])

    # Option 2: rule-based fallback (model unavailable or nothing extracted).
    if not relations:
        relations = _re_rule_based_relations(text)

    # Post-processing: drop duplicates and relations whose ends are not
    # known entities. The lookup set is built once instead of per relation.
    known_texts = {e["text"] for e in usable}
    seen = set()
    final_relations = []
    for rel in relations:
        key = (rel["head"], rel["tail"], rel["relation"])
        if key in seen:
            continue
        if rel["head"] in known_texts and rel["tail"] in known_texts:
            final_relations.append(rel)
            seen.add(key)
    return final_relations


def _re_single_entity_relations(entity, text):
    """Apply the keyword/regex rules used when only one entity is available."""
    name = entity["text"]
    kind = entity["type"]
    found = []

    # Rule 1: person + job title keyword anywhere in the text (first hit wins).
    if kind == "PER":
        for title in ("CEO", "经理", "总监", "工程师", "教授"):
            if title in text:
                found.append(
                    {"head": name, "tail": title, "relation": "担任职位"}
                )
                break

    # Rule 2: organisation/location followed by a "located in" verb.
    if kind in ("ORG", "LOC"):
        for verb in ("位于", "坐落于", "地处"):
            # re.escape: the entity text is data, not a pattern; names such as
            # "公司(北京)" or "C++" would otherwise break or mis-match.
            # Both the ASCII and the full-width comma end the clause.
            match = re.search(
                re.escape(name) + verb + r"(.*?)[,,。]", text
            )
            if match:
                place = match.group(1).strip()
                if place:  # an empty tail is not a usable relation
                    found.append(
                        {"head": name, "tail": place, "relation": "位置"}
                    )
                    break
    return found


def _re_chatglm_relations(text, entity_list):
    """Query ChatGLM for relations among ``entity_list``; return [] on failure."""
    try:
        prompt = f"""请分析以下文本中的实体关系,严格按照JSON列表格式返回:
文本内容:{text}
候选实体:{entity_list}
要求:
1. 只返回存在明确关系的实体对
2. 关系类型使用:属于、位于、任职于、合作、其他
3. 示例格式:[{{"head":"实体1", "tail":"实体2", "relation":"关系类型"}}]
请直接返回JSON,不要多余内容:"""

        response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.01)
        if isinstance(response, tuple):
            response = response[0]
    except Exception as e:  # best-effort: the rule-based fallback takes over
        print(f"ChatGLM关系抽取异常: {str(e)}")
        return []

    try:
        return _re_parse_relation_json(response, entity_list)
    except Exception as e:  # malformed model output must not abort extraction
        print(f"[DEBUG] 关系解析失败: {str(e)}")
        return []


def _re_parse_relation_json(response, entity_list):
    """Pull the first JSON list out of ``response`` and keep only valid relations.

    Only the validated list is ever returned, so unvalidated model output can
    never reach the caller's post-processing step.
    """
    found = re.search(r'(\[.*?\])', response, re.DOTALL)
    if not found:
        return []
    json_str = found.group(1)
    json_str = re.sub(r'[\u201c\u201d]', '"', json_str)  # curly quotes -> "
    json_str = re.sub(r'(?<!,)\n', '', json_str)  # keep newlines after commas
    parsed = json.loads(json_str)

    valid_rel_types = {"属于", "位于", "任职于", "合作", "其他"}
    return [
        rel for rel in parsed
        if isinstance(rel, dict)
        and rel.get("head") in entity_list
        and rel.get("tail") in entity_list
        # isinstance guard: an unhashable value would raise on set membership
        and isinstance(rel.get("relation"), str)
        and rel["relation"] in valid_rel_types
    ]


def _re_rule_based_relations(text):
    """Regex fallback: propose relations straight from surface patterns.

    The captured spans are greedy runs of non-delimiter characters; the caller
    filters out any relation whose ends are not recognised entities.
    """
    found = []

    # Rule 1: "A 位于/坐落于/地处 B". A non-capturing alternation is required
    # here; a [...] character class would match a single character only.
    for match in re.finditer(
        r'([^\s,,。]+)(?:位于|坐落于|地处)([^\s,,。]+)', text
    ):
        head, tail = match.groups()
        found.append({"head": head, "tail": tail, "relation": "位于"})

    # Rule 2: "A 属于/隶属于 B".
    for match in re.finditer(r'([^\s,,。]+)(?:属于|隶属于)([^\s,,。]+)', text):
        head, tail = match.groups()
        found.append({"head": head, "tail": tail, "relation": "属于"})

    # Rule 3: person - organisation ("... 现任/担任/就职于 ... 公司|大学").
    person_org_pattern = (
        r'([\u4e00-\u9fa5]{2,4})(现任|担任|就职于)'
        r'([\u4e00-\u9fa5]+?公司|[\u4e00-\u9fa5]+?大学)'
    )
    for match in re.finditer(person_org_pattern, text):
        head, _, tail = match.groups()
        found.append({"head": head, "tail": tail, "relation": "任职于"})

    return found
|
| 257 |
|
| 258 |
|
| 259 |
# ======================== 文本分析主流程 ========================
|