rwillats committed on
Commit
c82d2ac
·
verified ·
1 Parent(s): fecb216

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. hate_speech_demo.py +80 -46
hate_speech_demo.py CHANGED
@@ -36,7 +36,7 @@ function openPolicyPopup() {{
36
  </script>
37
  """
38
 
39
- # Replace your process_retrieval_text function with this updated version
40
  def process_retrieval_text(retrieval_text, user_input):
41
  """
42
  Process the retrieval text by identifying proper document boundaries
@@ -67,16 +67,41 @@ def process_retrieval_text(retrieval_text, user_input):
67
  content_match = re.search(r'Content:\s*(.*)', doc_info, re.DOTALL)
68
  content = content_match.group(1).strip() if content_match else "No content available"
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  # Format with clear section headers and better spacing
71
  formatted_html = f"""
72
  <div class='doc-section'>
73
  <h3 class="doc-number">Evidence Section {i+1}</h3>
74
 
75
  <div class="doc-section-info">
76
- <p><strong>Section Title:</strong> {doc_name}</p>
77
- <div class="subsection-info">
78
- <p><strong>Subsection:</strong> Page {page}</p>
79
- </div>
80
  </div>
81
 
82
  <div class="doc-content-container">
@@ -88,37 +113,57 @@ def process_retrieval_text(retrieval_text, user_input):
88
  chunks.append(formatted_html)
89
  else:
90
  # Fallback to a simpler approach - split by double newlines
91
- # but combine any small chunks that appear to be part of the same document
92
- raw_chunks = retrieval_text.strip().split("\n\n")
93
  chunks = []
94
- current_chunk = ""
95
-
96
- for chunk in raw_chunks:
97
- # If it's a short chunk without a clear document marker, or appears to be a continuation,
98
- # append to previous chunk
99
- if (len(chunk) < 50 and not re.search(r'doc|document|evidence', chunk.lower())) or \
100
- not chunk.strip().startswith(("Doc", "Document", "Evidence", "Source", "Content")):
101
- if current_chunk:
102
- current_chunk += "\n\n" + chunk
103
- else:
104
- current_chunk = chunk
105
- else:
106
- # This looks like a new document chunk
107
- if current_chunk:
108
- chunks.append(current_chunk)
109
- current_chunk = chunk
110
-
111
- # Add the last chunk if there is one
112
- if current_chunk:
113
- chunks.append(current_chunk)
114
 
115
  # Format each chunk with better section styling
116
- chunks = [f"""
117
- <div class='doc-section'>
118
- <h3 class="doc-title">Evidence Section {i+1}</h3>
119
- <div class='doc-content'>{chunk.strip()}</div>
120
- </div>
121
- """ for i, chunk in enumerate(chunks)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  # Extract keywords from user input (longer than 3 chars)
124
  keywords = re.findall(r'\b\w{4,}\b', user_input.lower())
@@ -165,26 +210,14 @@ def process_retrieval_text(retrieval_text, user_input):
165
 
166
  .doc-section-info {
167
  margin: 10px 0;
168
- padding: 8px;
169
  background: #f5f5f5;
170
  border-radius: 4px;
171
  }
172
 
173
  .doc-section-info p {
174
  margin: 5px 0;
175
- font-size: 16px;
176
- }
177
-
178
- .subsection-info {
179
- margin-left: 15px;
180
- padding-left: 10px;
181
- border-left: 2px solid #ddd;
182
- margin-top: 5px;
183
- }
184
-
185
- .subsection-info p {
186
  font-size: 14px;
187
- color: #555;
188
  }
189
 
190
  .doc-content-container {
@@ -194,6 +227,7 @@ def process_retrieval_text(retrieval_text, user_input):
194
  .doc-content-container h4 {
195
  margin-bottom: 8px;
196
  font-size: 16px;
 
197
  }
198
 
199
  .doc-content {
 
36
  </script>
37
  """
38
 
39
+ # Replace your process_retrieval_text function with this final version
40
  def process_retrieval_text(retrieval_text, user_input):
41
  """
42
  Process the retrieval text by identifying proper document boundaries
 
67
  content_match = re.search(r'Content:\s*(.*)', doc_info, re.DOTALL)
68
  content = content_match.group(1).strip() if content_match else "No content available"
69
 
70
+ # Try to parse the document title, section and subsection from content
71
+ doc_title = "Hate Speech Policy" # Default
72
+ section_title = ""
73
+ subsection_title = ""
74
+
75
+ # Look for document title pattern
76
+ doc_title_match = re.search(r'Document Title:\s*(.*?)(?:Section:|$)', content, re.DOTALL)
77
+ if doc_title_match:
78
+ doc_title = doc_title_match.group(1).strip()
79
+ # Remove this part from content
80
+ content = content.replace(doc_title_match.group(0), "").strip()
81
+
82
+ # Look for section pattern
83
+ section_match = re.search(r'Section:\s*(.*?)(?:Sub-Section:|$)', content, re.DOTALL)
84
+ if section_match:
85
+ section_title = section_match.group(1).strip()
86
+ # Remove this part from content
87
+ content = content.replace(section_match.group(0), "").strip()
88
+
89
+ # Look for subsection pattern
90
+ subsection_match = re.search(r'Sub-Section:\s*(.*?)(?:\n|$)', content, re.DOTALL)
91
+ if subsection_match:
92
+ subsection_title = subsection_match.group(1).strip()
93
+ # Remove this part from content
94
+ content = content.replace(subsection_match.group(0), "").strip()
95
+
96
  # Format with clear section headers and better spacing
97
  formatted_html = f"""
98
  <div class='doc-section'>
99
  <h3 class="doc-number">Evidence Section {i+1}</h3>
100
 
101
  <div class="doc-section-info">
102
+ <p><strong>Document Title:</strong> {doc_title}</p>
103
+ <p><strong>Section:</strong> {section_title if section_title else doc_name}</p>
104
+ <p><strong>Subsection:</strong> {subsection_title if subsection_title else "Page " + page}</p>
 
105
  </div>
106
 
107
  <div class="doc-content-container">
 
113
  chunks.append(formatted_html)
114
  else:
115
  # Fallback to a simpler approach - split by double newlines
 
 
116
  chunks = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  # Format each chunk with better section styling
119
+ for i, chunk in enumerate(retrieval_text.strip().split("\n\n")):
120
+ if not chunk.strip():
121
+ continue
122
+
123
+ # Try to extract document structure if possible
124
+ doc_title = "Hate Speech Policy" # Default
125
+ section_title = ""
126
+ subsection_title = ""
127
+ content = chunk
128
+
129
+ # Look for document title pattern
130
+ doc_title_match = re.search(r'Document Title:\s*(.*?)(?:Section:|$)', chunk, re.DOTALL)
131
+ if doc_title_match:
132
+ doc_title = doc_title_match.group(1).strip()
133
+ # Remove this part from content
134
+ content = content.replace(doc_title_match.group(0), "").strip()
135
+
136
+ # Look for section pattern
137
+ section_match = re.search(r'Section:\s*(.*?)(?:Sub-Section:|$)', chunk, re.DOTALL)
138
+ if section_match:
139
+ section_title = section_match.group(1).strip()
140
+ # Remove this part from content
141
+ content = content.replace(section_match.group(0), "").strip()
142
+
143
+ # Look for subsection pattern
144
+ subsection_match = re.search(r'Sub-Section:\s*(.*?)(?:\n|$)', chunk, re.DOTALL)
145
+ if subsection_match:
146
+ subsection_title = subsection_match.group(1).strip()
147
+ # Remove this part from content
148
+ content = content.replace(subsection_match.group(0), "").strip()
149
+
150
+ formatted_html = f"""
151
+ <div class='doc-section'>
152
+ <h3 class="doc-title">Evidence Section {i+1}</h3>
153
+
154
+ <div class="doc-section-info">
155
+ <p><strong>Document Title:</strong> {doc_title}</p>
156
+ <p><strong>Section:</strong> {section_title if section_title else "Section " + str(i+1)}</p>
157
+ <p><strong>Subsection:</strong> {subsection_title if subsection_title else "N/A"}</p>
158
+ </div>
159
+
160
+ <div class="doc-content-container">
161
+ <h4>Content:</h4>
162
+ <div class='doc-content'>{content}</div>
163
+ </div>
164
+ </div>
165
+ """
166
+ chunks.append(formatted_html)
167
 
168
  # Extract keywords from user input (longer than 3 chars)
169
  keywords = re.findall(r'\b\w{4,}\b', user_input.lower())
 
210
 
211
  .doc-section-info {
212
  margin: 10px 0;
213
+ padding: 10px;
214
  background: #f5f5f5;
215
  border-radius: 4px;
216
  }
217
 
218
  .doc-section-info p {
219
  margin: 5px 0;
 
 
 
 
 
 
 
 
 
 
 
220
  font-size: 14px;
 
221
  }
222
 
223
  .doc-content-container {
 
227
  .doc-content-container h4 {
228
  margin-bottom: 8px;
229
  font-size: 16px;
230
+ color: #333;
231
  }
232
 
233
  .doc-content {