Update main.py
Browse files
main.py
CHANGED
@@ -90,11 +90,29 @@ def proxy(path):
|
|
90 |
|
91 |
# Check if response is HTML and filter content if needed
|
92 |
content_type = resp.headers.get('Content-Type', '')
|
93 |
-
if 'text/html' in content_type:
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
# Parse HTML content
|
95 |
-
html_content = resp.content.decode('utf-8', errors='ignore')
|
96 |
soup = BeautifulSoup(html_content, 'html.parser')
|
97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
# Filter out "Полная версия ETKA"
|
99 |
for element in soup.find_all(string=re.compile('Полная версия ETKA')):
|
100 |
# Replace the text with empty string
|
|
|
90 |
|
91 |
# Check if response is HTML and filter content if needed
|
92 |
content_type = resp.headers.get('Content-Type', '')
|
93 |
+
if 'text/html' in content_type or resp.content.strip().startswith(b'<'):
|
94 |
+
# Try to determine the correct encoding
|
95 |
+
encoding = resp.encoding or 'utf-8'
|
96 |
+
try:
|
97 |
+
html_content = resp.content.decode(encoding, errors='ignore')
|
98 |
+
except UnicodeDecodeError:
|
99 |
+
html_content = resp.content.decode('utf-8', errors='ignore')
|
100 |
# Parse HTML content
|
|
|
101 |
soup = BeautifulSoup(html_content, 'html.parser')
|
102 |
|
103 |
+
# Ensure basic HTML structure exists
|
104 |
+
if not soup.html:
|
105 |
+
html_tag = soup.new_tag('html')
|
106 |
+
soup.append(html_tag)
|
107 |
+
|
108 |
+
if not soup.html.head:
|
109 |
+
head_tag = soup.new_tag('head')
|
110 |
+
soup.html.insert(0, head_tag)
|
111 |
+
|
112 |
+
if not soup.html.body:
|
113 |
+
body_tag = soup.new_tag('body')
|
114 |
+
soup.html.append(body_tag)
|
115 |
+
|
116 |
# Filter out "Полная версия ETKA"
|
117 |
for element in soup.find_all(string=re.compile('Полная версия ETKA')):
|
118 |
# Replace the text with empty string
|