Spaces:

zekun-li
/

geolm-base-toponym-recognition-demo

Sleeping

App Files Files Community

zekun-li committed 9 days ago

Commit

465e931

verified ·

1 Parent(s): 117e039

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -6

app.py CHANGED Viewed

@@ -10,6 +10,29 @@ model.to("cpu")  # Use "cuda" if you have GPU
 model.eval()
 def get_toponym_entities(text):
     inputs = tokenizer(
         text,
@@ -17,20 +40,39 @@ def get_toponym_entities(text):
         return_tensors="pt",
         truncation=True,
         max_length=512,
     )
     offset_mapping = inputs.pop("offset_mapping")[0]
-    input_ids = inputs["input_ids"]
     with torch.no_grad():
         outputs = model(**inputs)
         predictions = torch.argmax(outputs.logits, dim=2)[0]
     entities = []
-    for idx, label_id in enumerate(predictions):
-        if label_id != 0 and idx < len(offset_mapping):
-            start, end = offset_mapping[idx].tolist()
-            if end > start:
-                entities.append({"start": start, "end": end, "entity": "Topo"})
     return {"text": text, "entities": entities}

 model.eval()
+# def get_toponym_entities(text):
+#     inputs = tokenizer(
+#         text,
+#         return_offsets_mapping=True,
+#         return_tensors="pt",
+#         truncation=True,
+#         max_length=512,
+#     )
+#     offset_mapping = inputs.pop("offset_mapping")[0]
+#     input_ids = inputs["input_ids"]
+#     with torch.no_grad():
+#         outputs = model(**inputs)
+#         predictions = torch.argmax(outputs.logits, dim=2)[0]
+#     entities = []
+#     for idx, label_id in enumerate(predictions):
+#         if label_id != 0 and idx < len(offset_mapping):
+#             start, end = offset_mapping[idx].tolist()
+#             if end > start:
+#                 entities.append({"start": start, "end": end, "entity": "Topo"})
+#     return {"text": text, "entities": entities}
 def get_toponym_entities(text):
     inputs = tokenizer(
         text,
         return_tensors="pt",
         truncation=True,
         max_length=512,
+        return_attention_mask=True,
     )
     offset_mapping = inputs.pop("offset_mapping")[0]
+    input_ids = inputs["input_ids"][0]
     with torch.no_grad():
         outputs = model(**inputs)
         predictions = torch.argmax(outputs.logits, dim=2)[0]
     entities = []
+    current_entity = None
+    for idx, (pred, offset) in enumerate(zip(predictions, offset_mapping)):
+        start, end = offset.tolist()
+        if start == end:  # skip special tokens
+            continue
+        if pred != 0:  # Non-O label
+            if current_entity is None:
+                current_entity = {"start": start, "end": end}
+            else:
+                # Extend the current entity span
+                current_entity["end"] = end
+        else:
+            if current_entity is not None:
+                current_entity["entity"] = "Topo"
+                entities.append(current_entity)
+                current_entity = None
+    # Catch any lingering entity at the end
+    if current_entity is not None:
+        current_entity["entity"] = "Topo"
+        entities.append(current_entity)
     return {"text": text, "entities": entities}