Spaces:

Rivalcoder
/

Youtube_Dockor

Runtime error

Rivalcoder committed 6 days ago

Commit

a0bdeee

1 Parent(s): cbf58a5

Add

Files changed (2) hide show

Dockerfile CHANGED Viewed

@@ -1,19 +1,35 @@
 FROM python:3.10-slim
 # Install Chrome
-RUN apt-get update && apt-get install -y wget gnupg unzip curl \
- && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
- && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' \
- && apt-get update \
- && apt-get install -y google-chrome-stable
-# Install dependencies
-COPY requirements.txt /app/requirements.txt
 WORKDIR /app
 RUN pip install --no-cache-dir -r requirements.txt
 # Copy app code
-COPY app.py /app/app.py
 # Run the app
 CMD ["python", "app.py"]

 FROM python:3.10-slim
+# Install Chrome and dependencies
+RUN apt-get update && apt-get install -y wget gnupg unzip curl ca-certificates fonts-liberation libappindicator3-1 libasound2 libatk-bridge2.0-0 libnspr4 libnss3 libx11-xcb1 libxcomposite1 libxdamage1 libxrandr2 xdg-utils libu2f-udev libvulkan1
 # Install Chrome
+RUN wget -q -O google-chrome.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
+ && apt install -y ./google-chrome.deb \
+ && rm google-chrome.deb
+# Install matching ChromeDriver
+# The legacy chromedriver.storage.googleapis.com index stops at Chrome 114, so
+# the driver for current Chrome builds is resolved through the Chrome for
+# Testing endpoints instead. That archive unpacks into a chromedriver-linux64/
+# directory, hence the explicit move. curl -f makes an HTTP error fail the
+# build rather than feeding an error page into the download URL.
+RUN CHROME_VERSION=$(google-chrome --version | grep -oP '\d+\.\d+\.\d+') \
+ && CHROMEDRIVER_VERSION=$(curl -fsS "https://googlechromelabs.github.io/chrome-for-testing/LATEST_RELEASE_$CHROME_VERSION") \
+ && wget -q -O /tmp/chromedriver.zip "https://storage.googleapis.com/chrome-for-testing-public/${CHROMEDRIVER_VERSION}/linux64/chromedriver-linux64.zip" \
+ && unzip -q /tmp/chromedriver.zip -d /tmp/ \
+ && mv /tmp/chromedriver-linux64/chromedriver /usr/local/bin/chromedriver \
+ && chmod +x /usr/local/bin/chromedriver \
+ && rm -rf /tmp/chromedriver.zip /tmp/chromedriver-linux64
+# Set environment variables
+ENV PATH="/usr/local/bin:$PATH"
+ENV CHROME_BIN="/usr/bin/google-chrome"
+ENV CHROMEDRIVER="/usr/local/bin/chromedriver"
+# Set working dir
 WORKDIR /app
+# Install Python dependencies
+COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 # Copy app code
+COPY app.py .
 # Run the app
 CMD ["python", "app.py"]

app.py CHANGED Viewed

@@ -5,33 +5,34 @@ import undetected_chromedriver as uc
 def get_captions_selenium(video_url):
     try:
-        # Launch browser
         options = uc.ChromeOptions()
-        options.add_argument("--headless")
         options.add_argument("--no-sandbox")
         options.add_argument("--disable-dev-shm-usage")
         driver = uc.Chrome(options=options)
         driver.get(video_url)
-        time.sleep(5)
-        # Click "..." -> "Open transcript"
-        # YouTube UI changes often; this is just an example. May need tuning.
-        # Try to find subtitles in the page source (for auto-generated)
         page_source = driver.page_source
         if "captionTracks" in page_source:
             start = page_source.find("captionTracks")
             end = page_source.find("]", start) + 1
             caption_json = page_source[start:end]
             driver.quit()
-            return "✅ Found potential captions info in page source (you may need to parse this JSON)."
         else:
             driver.quit()
             return "⚠️ Captions info not found in source. May not be available or blocked."
     except Exception as e:
         return f"❌ Error: {str(e)}"
 # Gradio interface
@@ -40,5 +41,5 @@ gr.Interface(
     inputs=[gr.Textbox(label="YouTube Video URL")],
     outputs="text",
     title="YouTube Captions Scraper (Selenium)",
-    description="Extract captions using headless browser via Selenium."
 ).launch()

 def get_captions_selenium(video_url):
     try:
+        print("🚀 Launching Chrome...")
         options = uc.ChromeOptions()
+        options.add_argument("--headless=new")  # Use 'new' headless mode for Chrome 109+
         options.add_argument("--no-sandbox")
         options.add_argument("--disable-dev-shm-usage")
         driver = uc.Chrome(options=options)
+        print("🌍 Navigating to video...")
         driver.get(video_url)
+        print("⌛ Waiting for page to load...")
+        time.sleep(5)
+        print("📄 Scraping page source...")
         page_source = driver.page_source
         if "captionTracks" in page_source:
             start = page_source.find("captionTracks")
             end = page_source.find("]", start) + 1
             caption_json = page_source[start:end]
             driver.quit()
+            return "✅ Found potential captions info in page source (you may need to parse this JSON).\n\n" + caption_json
         else:
             driver.quit()
             return "⚠️ Captions info not found in source. May not be available or blocked."
     except Exception as e:
+        print(f"❌ Exception occurred: {e}")
         return f"❌ Error: {str(e)}"
 # Gradio interface
     inputs=[gr.Textbox(label="YouTube Video URL")],
     outputs="text",
     title="YouTube Captions Scraper (Selenium)",
+    description="Uses Selenium with undetected-chromedriver to extract captions from a YouTube video."
 ).launch()