Spaces:
Runtime error
Runtime error
Rivalcoder
committed on
Commit
·
a0bdeee
1
Parent(s):
cbf58a5
Add
Browse files- Dockerfile +25 -9
- app.py +10 -9
Dockerfile
CHANGED
@@ -1,19 +1,35 @@
|
|
1 |
FROM python:3.10-slim
|
2 |
|
|
|
|
|
|
|
3 |
# Install Chrome
|
4 |
-
RUN
|
5 |
-
&&
|
6 |
-
&&
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
WORKDIR /app
|
|
|
|
|
|
|
13 |
RUN pip install --no-cache-dir -r requirements.txt
|
14 |
|
15 |
# Copy app code
|
16 |
-
COPY app.py
|
17 |
|
18 |
# Run the app
|
19 |
CMD ["python", "app.py"]
|
|
|
1 |
FROM python:3.10-slim
|
2 |
|
3 |
+
# Install Chrome and dependencies
|
4 |
+
RUN apt-get update && apt-get install -y wget gnupg unzip curl ca-certificates fonts-liberation libappindicator3-1 libasound2 libatk-bridge2.0-0 libnspr4 libnss3 libx11-xcb1 libxcomposite1 libxdamage1 libxrandr2 xdg-utils libu2f-udev libvulkan1
|
5 |
+
|
6 |
# Install Chrome
|
7 |
+
RUN wget -q -O google-chrome.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
|
8 |
+
&& apt install -y ./google-chrome.deb \
|
9 |
+
&& rm google-chrome.deb
|
10 |
+
|
11 |
+
# Install matching ChromeDriver
|
12 |
+
RUN CHROME_VERSION=$(google-chrome --version | grep -oP '\d+\.\d+\.\d+') \
|
13 |
+
&& CHROMEDRIVER_VERSION=$(curl -s "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_$CHROME_VERSION") \
|
14 |
+
&& wget -O /tmp/chromedriver.zip https://chromedriver.storage.googleapis.com/${CHROMEDRIVER_VERSION}/chromedriver_linux64.zip \
|
15 |
+
&& unzip /tmp/chromedriver.zip -d /usr/local/bin/ \
|
16 |
+
&& chmod +x /usr/local/bin/chromedriver \
|
17 |
+
&& rm /tmp/chromedriver.zip
|
18 |
+
|
19 |
+
# Set environment variables
|
20 |
+
ENV PATH="/usr/local/bin:$PATH"
|
21 |
+
ENV CHROME_BIN="/usr/bin/google-chrome"
|
22 |
+
ENV CHROMEDRIVER="/usr/local/bin/chromedriver"
|
23 |
+
|
24 |
+
# Set working dir
|
25 |
WORKDIR /app
|
26 |
+
|
27 |
+
# Install Python dependencies
|
28 |
+
COPY requirements.txt .
|
29 |
RUN pip install --no-cache-dir -r requirements.txt
|
30 |
|
31 |
# Copy app code
|
32 |
+
COPY app.py .
|
33 |
|
34 |
# Run the app
|
35 |
CMD ["python", "app.py"]
|
app.py
CHANGED
@@ -5,33 +5,34 @@ import undetected_chromedriver as uc
|
|
5 |
|
6 |
def get_captions_selenium(video_url):
|
7 |
try:
|
8 |
-
|
9 |
options = uc.ChromeOptions()
|
10 |
-
options.add_argument("--headless")
|
11 |
options.add_argument("--no-sandbox")
|
12 |
options.add_argument("--disable-dev-shm-usage")
|
13 |
|
14 |
driver = uc.Chrome(options=options)
|
15 |
-
|
16 |
driver.get(video_url)
|
17 |
-
time.sleep(5)
|
18 |
|
19 |
-
|
20 |
-
|
21 |
|
22 |
-
|
23 |
page_source = driver.page_source
|
|
|
24 |
if "captionTracks" in page_source:
|
25 |
start = page_source.find("captionTracks")
|
26 |
end = page_source.find("]", start) + 1
|
27 |
caption_json = page_source[start:end]
|
28 |
driver.quit()
|
29 |
-
return "✅ Found potential captions info in page source (you may need to parse this JSON)
|
30 |
else:
|
31 |
driver.quit()
|
32 |
return "⚠️ Captions info not found in source. May not be available or blocked."
|
33 |
|
34 |
except Exception as e:
|
|
|
35 |
return f"❌ Error: {str(e)}"
|
36 |
|
37 |
# Gradio interface
|
@@ -40,5 +41,5 @@ gr.Interface(
|
|
40 |
inputs=[gr.Textbox(label="YouTube Video URL")],
|
41 |
outputs="text",
|
42 |
title="YouTube Captions Scraper (Selenium)",
|
43 |
-
description="
|
44 |
).launch()
|
|
|
5 |
|
6 |
def get_captions_selenium(video_url):
|
7 |
try:
|
8 |
+
print("🚀 Launching Chrome...")
|
9 |
options = uc.ChromeOptions()
|
10 |
+
options.add_argument("--headless=new") # Use 'new' headless mode for Chrome 109+
|
11 |
options.add_argument("--no-sandbox")
|
12 |
options.add_argument("--disable-dev-shm-usage")
|
13 |
|
14 |
driver = uc.Chrome(options=options)
|
15 |
+
print("🌐 Navigating to video...")
|
16 |
driver.get(video_url)
|
|
|
17 |
|
18 |
+
print("⌛ Waiting for page to load...")
|
19 |
+
time.sleep(5)
|
20 |
|
21 |
+
print("🔍 Scraping page source...")
|
22 |
page_source = driver.page_source
|
23 |
+
|
24 |
if "captionTracks" in page_source:
|
25 |
start = page_source.find("captionTracks")
|
26 |
end = page_source.find("]", start) + 1
|
27 |
caption_json = page_source[start:end]
|
28 |
driver.quit()
|
29 |
+
return "✅ Found potential captions info in page source (you may need to parse this JSON).\n\n" + caption_json
|
30 |
else:
|
31 |
driver.quit()
|
32 |
return "⚠️ Captions info not found in source. May not be available or blocked."
|
33 |
|
34 |
except Exception as e:
|
35 |
+
print(f"❌ Exception occurred: {e}")
|
36 |
return f"❌ Error: {str(e)}"
|
37 |
|
38 |
# Gradio interface
|
|
|
41 |
inputs=[gr.Textbox(label="YouTube Video URL")],
|
42 |
outputs="text",
|
43 |
title="YouTube Captions Scraper (Selenium)",
|
44 |
+
description="Uses Selenium with undetected-chromedriver to extract captions from a YouTube video."
|
45 |
).launch()
|