Spaces:
Running
Running
import re | |
from html import unescape | |
def clean_text(t): | |
t = clean_markdown(t) | |
t = t.replace("\n"," ") | |
t = t.replace("\t"," ") | |
t = t.replace("^M"," ") | |
t = t.replace("\r"," ") | |
t = t.replace(" ,", ",") | |
t = re.sub(" +", " ", t) | |
return t | |
def clean_markdown(md_text): | |
# Remove code blocks | |
md_text = re.sub(r'```.*?```', '', md_text, flags=re.DOTALL) | |
# Remove inline code | |
md_text = re.sub(r'`[^`]*`', '', md_text) | |
# Remove images | |
md_text = re.sub(r'!\[.*?\]\(.*?\)', '', md_text) | |
# Remove links but keep link text | |
md_text = re.sub(r'\[([^\]]+)\]\(.*?\)', r'\1', md_text) | |
# Remove bold and italic (groups of *, _) | |
md_text = re.sub(r'(\*\*|__)(.*?)\1', r'\2', md_text) | |
md_text = re.sub(r'(\*|_)(.*?)\1', r'\2', md_text) | |
# Remove headings | |
md_text = re.sub(r'#+ ', '', md_text) | |
# Remove blockquotes | |
md_text = re.sub(r'^>.*$', '', md_text, flags=re.MULTILINE) | |
# Remove list markers | |
md_text = re.sub(r'^(\s*[-*+]|\d+\.)\s+', '', md_text, flags=re.MULTILINE) | |
# Remove horizontal rules | |
md_text = re.sub(r'^\s*[-*_]{3,}\s*$', '', md_text, flags=re.MULTILINE) | |
# Remove tables | |
md_text = re.sub(r'\|.*?\|', '', md_text) | |
# Remove raw HTML tags | |
md_text = re.sub(r'<.*?>', '', md_text) | |
# Decode HTML entities | |
md_text = unescape(md_text) | |
return md_text | |