TEST-GIZ-Project-Search

Sleeping

annikwag committed on Feb 26

Commit

221e09e

verified ·

1 Parent(s): 0eaa45d

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -162,20 +162,20 @@ lexical_all = results[1]
 # 2) Filter out content < 20 chars (as intermediate fix to problem that e.g. super short paragraphs with few chars get high similarity score)
 semantic_all = [
-    r for r in semantic_all if len(r.payload["page_content"]) >= 20
 ]
 lexical_all = [
-    r for r in lexical_all if len(r.payload["page_content"]) >= 20
 ]
 # 2) Apply a threshold to SEMANTIC results (score >= 0.4)
-semantic_thresholded = [r for r in semantic_all if r.score >= 0.4]
 # 2) Filter the entire sets
 filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter) ## , end_year_range ToDo add end_year filter again
 filtered_lexical = filter_results(lexical_all, country_filter, region_filter)## , end_year_range ToDo add end_year filter again
-filtered_semantic_no_dupe = remove_duplicates(filtered_semantic)
 filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)

 # 2) Filter out content < 5 chars (as an intermediate fix for the problem that, e.g., very short paragraphs with only a few chars get a high similarity score)
 semantic_all = [
+    r for r in semantic_all if len(r.payload["page_content"]) >= 5
 ]
 lexical_all = [
+    r for r in lexical_all if len(r.payload["page_content"]) >= 5
 ]
 # 2) Apply a threshold to SEMANTIC results (score >= 0.0)
+semantic_thresholded = [r for r in semantic_all if r.score >= 0.0]
 # 2) Filter the entire sets
 filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter) ## , end_year_range ToDo add end_year filter again
 filtered_lexical = filter_results(lexical_all, country_filter, region_filter)## , end_year_range ToDo add end_year filter again
+filtered_semantic_no_dupe = remove_duplicates(filtered_semantic) # ToDo remove duplicates again?
 filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)