Spaces:
Running
Running
Commit
·
b4b8d2a
1
Parent(s):
7e00fac
progress more 20..
Browse files
app.py
CHANGED
@@ -108,12 +108,14 @@ def fuzzy_deduplicate(df, column, threshold=65):
|
|
108 |
def process_file(uploaded_file):
|
109 |
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
110 |
|
|
|
|
|
111 |
# Apply fuzzy deduplication
|
112 |
df = df.groupby('Объект').apply(
|
113 |
lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
|
114 |
).reset_index(drop=True)
|
115 |
|
116 |
-
|
117 |
remaining_news_count = len(df)
|
118 |
duplicates_removed = original_news_count - remaining_news_count
|
119 |
|
|
|
108 |
def process_file(uploaded_file):
|
109 |
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
110 |
|
111 |
+
original_news_count = len(df)
|
112 |
+
|
113 |
# Apply fuzzy deduplication
|
114 |
df = df.groupby('Объект').apply(
|
115 |
lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
|
116 |
).reset_index(drop=True)
|
117 |
|
118 |
+
|
119 |
remaining_news_count = len(df)
|
120 |
duplicates_removed = original_news_count - remaining_news_count
|
121 |
|