Spaces:
Running
Running
Initial space app
Browse files- .gitattributes +2 -0
- README.md +4 -4
- app.py +165 -0
- index.tsv +3 -0
- requirements.txt +4 -0
.gitattributes
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
@@ -33,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
1 |
+
index.tsv filter=lfs diff=lfs merge=lfs -text
|
2 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
3 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
4 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
34 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
35 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
36 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
37 |
+
index.tsv filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
---
|
2 |
title: Waqfeya
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 5.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
|
|
1 |
---
|
2 |
title: Waqfeya
|
3 |
+
emoji: 📚
|
4 |
+
colorFrom: pink
|
5 |
+
colorTo: pink
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 5.9.1
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
app.py
ADDED
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import csv
|
2 |
+
import json
|
3 |
+
import urllib.parse
|
4 |
+
|
5 |
+
from pathlib import Path
|
6 |
+
|
7 |
+
import gradio as gr
|
8 |
+
|
9 |
+
from fuzzywuzzy import fuzz
|
10 |
+
from pyarabic.araby import strip_tashkeel
|
11 |
+
|
12 |
+
|
13 |
+
def main():
    """Build the Gradio search UI for the library index and launch it.

    The page holds a title textbox, optional category/author dropdowns, a
    results table and a (initially hidden) set of tabs with download links.
    The index is loaded once per page load via ``demo.load`` and kept in a
    ``gr.State`` so the event handlers can read it.
    """
    with gr.Blocks(
        theme=gr.themes.Default(font=[gr.themes.GoogleFont('Noto Sans Arabic'), 'Arial', 'sans-serif']),
        css='\n'.join([
            'html, body, .gradio-container { direction: rtl !important; }',
            'h1 { text-align: center; display: block; }',
            'th, td { text-align: right !important; }',
            'th span { white-space: nowrap !important; }',
            '.icon-wrap { right: unset !important; left: var(--size-3) !important; }',
        ])
    ) as demo:
        # Server-side state: the full index and the rows currently shown in the table.
        index_state = gr.State()
        results_data = gr.State()

        gr.Markdown('# ابحث في كتب المكتبة الوقفية 📚', rtl=True)

        title = gr.Textbox(label='عنوان الكتاب', placeholder='اكتب عنوان الكتاب', lines=1, rtl=True)

        with gr.Row():
            # Both dropdowns show a loading placeholder until load_data() fills them.
            category = gr.Dropdown(choices=['جارٍ التحميل...'], label='التصنيف (اختياري)', interactive=False)
            author = gr.Dropdown(choices=['جارٍ التحميل...'], label='المؤلف (اختياري)', interactive=False)

        search_button = gr.Button('ابحث')

        gr.Markdown('## النتائج 🎯', rtl=True)

        results = gr.Dataframe(headers=['#', 'العنوان', 'المؤلف', 'التصنيف', 'درجة التطابق'], interactive=False)

        download_label = gr.Markdown('### تحميل ملفات الكتاب 📥', visible=False, rtl=True)
        with gr.Tabs(visible=False) as details_box:
            with gr.Tab('PDF'):
                pdf_tab = gr.Markdown(rtl=True)
            with gr.Tab('TXT'):
                txt_tab = gr.Markdown(rtl=True)
            with gr.Tab('DOCX'):
                docx_tab = gr.Markdown(rtl=True)

        def load_data():
            """Load the index and enable the dropdowns with their real choices."""
            _index = load_index()
            _categories = get_categories(_index)
            _authors = get_authors(_index)

            return (
                _index,
                gr.update(choices=_categories, value=_categories[0], interactive=True),
                gr.update(choices=_authors, value=_authors[0], interactive=True),
            )

        def run_search(t, c, a, idx):
            """Run the search and also hide the download heading of a previous selection."""
            return [*handle_search(idx, t, c, a), gr.update(visible=False)]

        def show_details(evt: gr.SelectData, index_state, results_data):
            """Show download links for the book in the clicked results row."""
            # Nothing to show before the index is loaded or before any search ran.
            if not index_state or not results_data:
                return [gr.update() for _ in range(5)]

            book_id = results_data[evt.index[0]][0]

            # Placeholder rows ("enter a title" / "no results") carry '' instead
            # of a numeric id; clicking them must not raise.
            if not isinstance(book_id, int):
                return [gr.update() for _ in range(5)]

            # Ids are 1-based positions in the index (assigned by load_index).
            book_details = index_state[book_id - 1]

            # The last element of a row is the normalized title; the three
            # before it are the PDF, TXT and DOCX path lists.
            pdf_paths = generate_download_url(book_details[-4])
            txt_paths = generate_download_url(book_details[-3])
            docx_paths = generate_download_url(book_details[-2])

            return [
                gr.update(visible=True, value=pdf_paths),
                gr.update(visible=True, value=txt_paths),
                gr.update(visible=True, value=docx_paths),
                gr.update(visible=True),
                gr.update(visible=True),
            ]

        search_button.click(
            fn=run_search,
            inputs=[title, category, author, index_state],
            outputs=[results, results_data, details_box, download_label],
        )

        results.select(
            fn=show_details,
            inputs=[index_state, results_data],
            outputs=[pdf_tab, txt_tab, docx_tab, details_box, download_label],
        )

        demo.load(load_data, outputs=[index_state, category, author])

    demo.launch()
|
91 |
+
|
92 |
+
|
93 |
+
def load_index():
    """Read ``index.tsv`` and return its data rows as a list of lists.

    The first line of the file is treated as a header and dropped. Every
    remaining row is returned as ``[number, *fields, normalized]`` where
    ``number`` is the 1-based position of the row and ``normalized`` is
    ``normalize_text`` applied to the row's third field (the field the rest
    of this file displays as the book title).

    Raises:
        OSError: if ``index.tsv`` cannot be opened.
        IndexError: if a data row has fewer than three fields.
    """
    # newline='' is what the csv module expects so that it can do its own
    # line-ending handling (matters for quoted fields containing newlines).
    with open('index.tsv', 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file, delimiter='\t')
        next(reader, None)  # skip the header line; tolerate an empty file

        return [
            [number, *fields, normalize_text(fields[2])]
            for number, fields in enumerate(reader, start=1)
        ]
|
101 |
+
|
102 |
+
|
103 |
+
def get_categories(index):
    """Return the dropdown choices built from element 1 of every index row.

    The result starts with an empty string (the "no filter" choice) followed
    by the distinct non-empty values in sorted order.
    """
    distinct = {entry[1] for entry in index if entry[1]}
    return ['', *sorted(distinct)]
|
105 |
+
|
106 |
+
|
107 |
+
def get_authors(index):
    """Return the dropdown choices built from element 2 of every index row.

    The result starts with an empty string (the "no filter" choice) followed
    by the distinct non-empty values in sorted order.
    """
    distinct = {entry[2] for entry in index if entry[2]}
    return ['', *sorted(distinct)]
|
109 |
+
|
110 |
+
|
111 |
+
def handle_search(index, title: str, category: str, author: str, *, min_score: int = 50, max_results: int = 100):
    """Fuzzy-search the index by title, optionally filtered by category/author.

    Args:
        index: rows as produced by ``load_index`` (``row[0]`` id, ``row[1]``
            category, ``row[2]`` author, ``row[3]`` title, ``row[-1]``
            normalized title), or ``None`` if the index is not loaded yet.
        title: the user's query; it is normalized before matching.
        category: exact category to keep, or an empty value for no filter.
        author: exact author to keep, or an empty value for no filter.
        min_score: rows must score strictly above this to be returned.
        max_results: maximum number of rows returned.

    Returns:
        A three-element list: the table rows for display, the same rows for
        the state that backs row selection, and an update that hides the
        details tabs. When there is nothing to show, the rows are a single
        placeholder row whose id cell is ``''`` and whose title cell carries
        a message.
    """
    def placeholder(message):
        # One row shaped like a result row, with the message in the title cell.
        rows = [['', message, '', '', '']]
        return [rows, rows, gr.update(visible=False)]

    title = normalize_text(title)

    if not title.strip():
        return placeholder('يرجى إدخال عنوان للبحث.')

    # The state holding the index is empty until the page's load event has
    # finished; searching before that must not raise.
    if index is None:
        return placeholder('جارٍ التحميل...')

    filtered = index

    if category:
        filtered = [row for row in filtered if row[1] == category]

    if author:
        filtered = [row for row in filtered if row[2] == author]

    scored_results = []
    for row in filtered:
        score = fuzz.partial_ratio(title, row[-1])

        if score > min_score:
            scored_results.append((score, row))

    if not scored_results:
        return placeholder('لم يتم العثور على نتائج مطابقة.')

    # Tuples sort by score first; ties fall through to comparing the rows,
    # whose first element is the unique numeric id.
    scored_results.sort(reverse=True)

    result_table = [[row[0], row[3], row[2], row[1], score] for score, row in scored_results[:max_results]]

    return [result_table, result_table, gr.update(visible=False)]
|
140 |
+
|
141 |
+
|
142 |
+
def generate_download_url(paths):
    """Turn a stringified list of file paths into a Markdown list of download links.

    Args:
        paths: text of a list literal such as ``"['./dir/file.pdf']"``. The
            first two characters of every path are dropped before the URL is
            built (presumably a leading ``./`` — confirm against the index).

    Returns:
        One ``- [file name](url?download=true)`` line per path, joined with
        newlines; an empty string when ``paths`` is blank or the list is empty.
    """
    import ast  # local import: only this function needs it

    if not paths or not paths.strip():
        return ''

    try:
        # Parse the list literal properly so that file names containing
        # apostrophes or double quotes survive.
        parsed_paths = ast.literal_eval(paths)
    except (ValueError, SyntaxError):
        # Fall back to the previous quote-swapping JSON parse for any input
        # that is not a valid Python literal.
        parsed_paths = json.loads(paths.replace("'", '"'))

    formatted_paths = []

    for path in parsed_paths:
        encoded_path = f'https://huggingface.co/datasets/ieasybooks-org/waqfeya-library/resolve/main/{urllib.parse.quote(path[2:])}'
        formatted_path = f'{encoded_path}?download=true'
        formatted_paths.append(f'- [{Path(path).name}]({formatted_path})')

    return '\n'.join(formatted_paths)
|
151 |
+
|
152 |
+
|
153 |
+
def normalize_text(text):
    """Return ``text`` with tashkeel stripped and letter variants unified.

    After ``strip_tashkeel``, the three hamza/madda alef forms are mapped to
    bare alef, yeh to alef maksura, and teh marbuta to heh, so that titles
    and queries written with different variants compare equal.
    """
    letter_map = str.maketrans({
        'أ': 'ا',
        'إ': 'ا',
        'آ': 'ا',
        'ي': 'ى',
        'ة': 'ه',
    })

    return strip_tashkeel(text).translate(letter_map)
|
162 |
+
|
163 |
+
|
164 |
+
# Launch the app only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()
|
index.tsv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4c8c53cf6fbe2340fb6ebc3194c09c7b485d06adc627cc9d28a0c39e2c7a0df0
|
3 |
+
size 16784007
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
PyArabic==0.6.15
|
2 |
+
fuzzywuzzy==0.18.0
|
3 |
+
gradio==5.9.1
|
4 |
+
python-Levenshtein==0.27.1
|