# Image for a Streamlit application (app.py) served on port 7860.
# Layers are ordered least -> most frequently changing so that OS packages,
# Python dependencies and NLTK corpora stay cached when only app code changes.
FROM python:3.10-slim

WORKDIR /app

# Install system dependencies (sorted alphabetically for easier diffs).
# update + install + list cleanup stay in one layer so the apt index is never
# stale and never persisted into the image.
# NOTE(review): --no-install-recommends would shrink the image, but it may drop
# packages that libreoffice / tesseract-ocr pull in as recommends; verify
# before enabling.
RUN apt-get update && apt-get install -y \
        build-essential \
        curl \
        git \
        libmagic1 \
        libpoppler-dev \
        libreoffice \
        software-properties-common \
        tesseract-ocr \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so the dependency layer is reused until
# requirements.txt itself changes.
COPY requirements.txt .

# Install Python dependencies.
RUN pip install --no-cache-dir -r requirements.txt

# Tell NLTK where its data lives. This must be set BEFORE the download step:
# otherwise the downloader falls back to the build user's home directory and
# the directory below would stay empty.
ENV NLTK_DATA=/usr/local/share/nltk_data

# Create the NLTK data directory and pre-download the corpora into it, in a
# single layer. The directory itself is left world-writable (non-recursive) so
# a runtime user other than root can still add further NLTK packages.
RUN mkdir -p "$NLTK_DATA" \
    && chmod 777 "$NLTK_DATA" \
    && python -m nltk.downloader -d "$NLTK_DATA" \
        punkt \
        averaged_perceptron_tagger \
        stopwords \
        words \
        wordnet \
        omw-1.4

# Copy the rest of the application (last, because it changes most often).
COPY . .

# NOTE(review): no USER is set, so the container runs as root unless the
# runtime overrides the UID; consider adding a dedicated non-root user once it
# is confirmed the app does not need to write under /app.

# Port Streamlit listens on (documentation only; does not publish the port).
EXPOSE 7860

# Start the Streamlit server, bound to all interfaces so it is reachable from
# outside the container. Exec form keeps streamlit as PID 1 for signal handling.
CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]