Adeptschneider committed
Commit 18a68e7 · 1 Parent(s): 3c81ee5

Feat: DocMindAI
Dockerfile ADDED
@@ -0,0 +1,30 @@
+ FROM python:3.10-slim
+
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     curl \
+     git \
+     software-properties-common \
+     libpoppler-dev \
+     libmagic1 \
+     tesseract-ocr \
+     libreoffice \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements first for better caching
+ COPY requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the rest of the application
+ COPY . .
+
+ # Expose the port Streamlit runs on
+ EXPOSE 7860
+
+ # Command to run the application
+ CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
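
To try the image locally, a minimal build-and-run sketch (the `docmind` tag is an arbitrary choice; port 7860 matches the `EXPOSE` and `CMD` lines above):

```bash
# Build the image from the repository root (tag name is illustrative)
docker build -t docmind .

# Run it, publishing the Streamlit port declared in the Dockerfile;
# the app should then be reachable at http://localhost:7860
docker run --rm -p 7860:7860 docmind
```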
Ingestion/__init__.py ADDED
File without changes
Ingestion/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (139 Bytes).
Ingestion/__pycache__/ingest.cpython-312.pyc ADDED
Binary file (9.71 kB).
Ingestion/ingest.py ADDED
@@ -0,0 +1,264 @@
+ import os
+ import pymupdf4llm
+ import pandas as pd
+ import tempfile
+ from typing import Dict, Any, Optional, List, Callable
+
+ # Import unstructured components for different file types
+ from unstructured.partition.auto import partition
+ from unstructured.partition.pdf import partition_pdf
+ from unstructured.partition.docx import partition_docx
+ from unstructured.partition.pptx import partition_pptx
+ from unstructured.partition.xlsx import partition_xlsx
+ from unstructured.partition.md import partition_md
+ from unstructured.partition.html import partition_html
+ from unstructured.partition.xml import partition_xml
+ from unstructured.partition.email import partition_email
+ from unstructured.partition.text import partition_text
+ from unstructured.partition.epub import partition_epub
+
+ def get_processor_for_file(file_path: str) -> Optional[Callable]:
+     """
+     Determine the appropriate processor function for the given file type.
+     """
+     file_extension = os.path.splitext(file_path)[1].lower()
+
+     # Map file extensions to specific partition functions
+     processors = {
+         ".pdf": process_pdf,
+         ".docx": process_docx,
+         ".doc": process_docx,
+         ".pptx": process_pptx,
+         ".ppt": process_pptx,
+         ".xlsx": process_xlsx,
+         ".xls": process_xlsx,
+         ".md": process_markdown,
+         ".html": process_html,
+         ".htm": process_html,
+         ".xml": process_xml,
+         ".msg": process_email,
+         ".eml": process_email,
+         ".epub": process_epub,
+         ".txt": process_text,
+         ".csv": process_text,
+         ".rtf": process_text,
+
+         # Code files
+         ".py": process_text,
+         ".js": process_text,
+         ".java": process_text,
+         ".ts": process_text,
+         ".tsx": process_text,
+         ".jsx": process_text,
+         ".c": process_text,
+         ".cpp": process_text,
+         ".h": process_text,
+         ".cs": process_text,
+         ".rb": process_text,
+         ".go": process_text,
+         ".rs": process_text,
+         ".php": process_text,
+         ".sql": process_text,
+         ".css": process_text,
+     }
+
+     return processors.get(file_extension, process_generic)
+
+ def process_document(file_path: str) -> Optional[str]:
+     """
+     Process a document using the appropriate processor based on file type.
+     """
+     processor = get_processor_for_file(file_path)
+     if processor:
+         return processor(file_path)
+     return None
+
+ def process_pdf(file_path: str) -> str:
+     """
+     Process PDF documents using unstructured.
+     """
+     temp_dir = tempfile.mkdtemp()
+
+     try:
+         # Try hi_res mode first with OCR capabilities
+         elements = partition_pdf(
+             filename=file_path,
+             strategy="hi_res",
+             extract_images_in_pdf=True,
+             extract_image_block_types=["Image", "Table"],
+             extract_image_block_to_payload=False,
+             extract_image_block_output_dir=temp_dir,
+             hi_res_model_name="yolox",
+             infer_table_structure=True,
+             chunking_strategy="by_title",
+             max_characters=4000,
+             new_after_n_chars=3800,
+             combine_text_under_n_chars=2000,
+         )
+     except Exception:
+         # Fall back to fast mode if hi_res fails
+         elements = partition_pdf(
+             filename=file_path,
+             strategy="fast",
+             chunking_strategy="by_title",
+             max_characters=4000,
+             new_after_n_chars=3800,
+             combine_text_under_n_chars=2000,
+         )
+
+     # Extract text from elements
+     texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+     combined_text = "\n\n".join(texts)
+
+     return combined_text
+
+ def process_docx(file_path: str) -> str:
+     """
+     Process DOCX documents using unstructured.
+     """
+     elements = partition_docx(
+         filename=file_path,
+         chunking_strategy="by_title",
+         max_characters=4000,
+         new_after_n_chars=3800,
+         combine_text_under_n_chars=2000,
+     )
+
+     texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+     combined_text = "\n\n".join(texts)
+
+     return combined_text
+
+ def process_pptx(file_path: str) -> str:
+     """
+     Process PPTX documents using unstructured.
+     """
+     elements = partition_pptx(
+         filename=file_path,
+     )
+
+     texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+     combined_text = "\n\n".join(texts)
+
+     return combined_text
+
+ def process_xlsx(file_path: str) -> str:
+     """
+     Process XLSX documents using unstructured.
+     """
+     elements = partition_xlsx(
+         filename=file_path,
+     )
+
+     texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+     combined_text = "\n\n".join(texts)
+
+     return combined_text
+
+ def process_markdown(file_path: str) -> str:
+     """
+     Process Markdown documents using unstructured.
+     """
+     elements = partition_md(
+         filename=file_path,
+     )
+
+     texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+     combined_text = "\n\n".join(texts)
+
+     return combined_text
+
+ def process_html(file_path: str) -> str:
+     """
+     Process HTML documents using unstructured.
+     """
+     elements = partition_html(
+         filename=file_path,
+     )
+
+     texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+     combined_text = "\n\n".join(texts)
+
+     return combined_text
+
+ def process_xml(file_path: str) -> str:
+     """
+     Process XML documents using unstructured.
+     """
+     elements = partition_xml(
+         filename=file_path,
+     )
+
+     texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+     combined_text = "\n\n".join(texts)
+
+     return combined_text
+
+ def process_email(file_path: str) -> str:
+     """
+     Process email documents using unstructured.
+     """
+     elements = partition_email(
+         filename=file_path,
+     )
+
+     texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+     combined_text = "\n\n".join(texts)
+
+     return combined_text
+
+ def process_text(file_path: str) -> str:
+     """
+     Process plain-text documents using unstructured.
+     """
+     elements = partition_text(
+         filename=file_path,
+         chunking_strategy="by_title",
+         max_characters=4000,
+         new_after_n_chars=3800,
+         combine_text_under_n_chars=2000,
+     )
+
+     texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+     combined_text = "\n\n".join(texts)
+
+     return combined_text
+
+ def process_epub(file_path: str) -> str:
+     """
+     Process EPUB documents using unstructured.
+     """
+     elements = partition_epub(
+         filename=file_path,
+     )
+
+     texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+     combined_text = "\n\n".join(texts)
+
+     return combined_text
+
+ def process_generic(file_path: str) -> str:
+     """
+     Generic document processor using unstructured's auto partitioning.
+     """
+     try:
+         elements = partition(
+             filename=file_path,
+         )
+
+         texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+         combined_text = "\n\n".join(texts)
+
+         return combined_text
+     except Exception as e:
+         # Fall back to basic text processing if auto-partition fails
+         try:
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 return f.read()
+         except Exception:
+             # Try with a different encoding if utf-8 fails
+             try:
+                 with open(file_path, 'r', encoding='latin-1') as f:
+                     return f.read()
+             except Exception as e2:
+                 raise Exception(f"Could not process file: {str(e)} / {str(e2)}")
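
A minimal usage sketch for the module above; the sample path is hypothetical, and unmapped extensions fall back to unstructured's auto partitioning via `process_generic`:

```python
# Hypothetical usage of Ingestion.ingest (the example path is illustrative)
from Ingestion.ingest import get_processor_for_file, process_document

path = "docs/report.pdf"  # assumed sample file
print(get_processor_for_file(path).__name__)  # -> process_pdf

# Returns the document as plain text, chunks joined by blank lines
text = process_document(path)
print(text[:500] if text else "No content extracted")
```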
README.md CHANGED
@@ -1,12 +1,104 @@
- ---
- title: DocMindAI
- emoji: 🌖
- colorFrom: yellow
- colorTo: pink
- sdk: docker
- pinned: false
- license: apache-2.0
- short_description: DocMindAI
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # DocMind AI Installation Guide
+
+ This guide will help you set up and run DocMind AI, an open-source, LLM-powered document analysis application.
+
+ ## Prerequisites
+
+ 1. [Python 3.8+](https://www.python.org/downloads/)
+ 2. [Ollama](https://ollama.com/) - for running local LLMs
+ 3. (Optional) [Docker](https://www.docker.com/) and [Docker Compose](https://docs.docker.com/compose/install/) for containerized deployment
+
+ ## Option 1: Local Installation
+
+ 1. **Clone the repository:**
+
+    ```bash
+    git clone https://huggingface.co/spaces/davisandshirtliff/DocMindAI
+    cd DocMindAI
+    ```
+
+ 2. **Create a virtual environment:**
+
+    ```bash
+    python -m venv venv
+    source venv/bin/activate  # On Windows, use: venv\Scripts\activate
+    ```
+
+ 3. **Install dependencies:**
+
+    ```bash
+    pip install -r requirements.txt
+    ```
+
+ 4. **Run Ollama:**
+
+    Make sure Ollama is installed and running locally, then pull a model to use with the application:
+
+    ```bash
+    ollama pull gemma3:1b
+    ```
+
+ 5. **Run the application:**
+
+    ```bash
+    streamlit run app.py
+    ```
+
+    The application will be accessible at `http://localhost:8501` in your web browser.
+
+ ## Option 2: Docker Deployment
+
+ 1. **Clone the repository:**
+
+    ```bash
+    git clone https://huggingface.co/spaces/davisandshirtliff/DocMindAI
+    cd DocMindAI
+    ```
+
+ 2. **Run with Docker Compose:**
+
+    Make sure Ollama is running on your host machine, then:
+
+    ```bash
+    docker-compose up --build
+    ```
+
+    The application will be accessible at `http://localhost:8501` in your web browser.
+
+ ## Usage
+
+ 1. Enter your Ollama Base URL (default: `http://localhost:11434`)
+ 2. Select an Ollama model from the dropdown
+ 3. Upload documents for analysis
+ 4. Choose your analysis settings:
+    - Select a prompt type
+    - Choose a tone
+    - Select instructions
+    - Set the desired length/detail
+    - Choose the analysis mode
+ 5. Click "Extract and Analyze"
+ 6. Once analysis is complete, you can chat with your documents in the chat interface
+
+ ## Supported File Types
+
+ DocMind AI supports a wide range of file formats, including:
+
+ - PDF
+ - DOCX, DOC
+ - TXT
+ - XLSX, XLS
+ - MD (Markdown)
+ - JSON
+ - XML
+ - RTF
+ - CSV
+ - MSG, EML (email)
+ - PPTX, PPT (PowerPoint)
+ - ODT (OpenDocument Text)
+ - EPUB (e-book)
+ - Code files (PY, JS, JAVA, TS, TSX, C, CPP, H, and many more)
+
+ ## Troubleshooting
+
+ - If you encounter issues connecting to Ollama, make sure it is running and the URL is correct (a quick check is shown below this list).
+ - For Docker deployment, ensure that your Docker configuration allows access to the host network.
+ - For document processing issues, check that you have the necessary dependencies installed.
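
As a quick sanity check on the Ollama connection (assuming the default base URL above), its REST API will list the locally pulled models:

```bash
# Returns a JSON list of installed models if Ollama is up and reachable
curl http://localhost:11434/api/tags
```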
app.py ADDED
@@ -0,0 +1,625 @@
+ import streamlit as st
+ import pandas as pd
+ import os
+ import tempfile
+ from typing import List, Optional, Dict, Any, Union
+ import json
+ from datetime import datetime
+ from llama_cpp import Llama
+ from langchain.output_parsers import PydanticOutputParser
+ from langchain.prompts import ChatPromptTemplate
+ from langchain.schema import HumanMessage, SystemMessage
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.schema.runnable import RunnablePassthrough
+ from langchain.prompts.prompt import PromptTemplate
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.chains import LLMChain
+ from langchain.memory import ConversationBufferMemory
+ from langchain.vectorstores import Chroma
+ from pydantic import BaseModel, Field
+ from Ingestion.ingest import process_document, get_processor_for_file
+
+ import warnings
+ warnings.filterwarnings("ignore", category=RuntimeWarning)
+
+ # Set page configuration
+ st.set_page_config(
+     page_title="DocMind AI: AI-Powered Document Analysis",
+     page_icon="🧠",
+     layout="wide",
+     initial_sidebar_state="expanded",
+ )
+
+ # Custom CSS for better dark/light mode compatibility
+ st.markdown("""
+ <style>
+     /* Common styles for both modes */
+     .stApp {
+         max-width: 1200px;
+         margin: 0 auto;
+     }
+
+     /* Card styling for results */
+     .card {
+         border-radius: 5px;
+         padding: 1.5rem;
+         margin-bottom: 1rem;
+         border: 1px solid rgba(128, 128, 128, 0.2);
+     }
+
+     /* Dark mode specific */
+     @media (prefers-color-scheme: dark) {
+         .card {
+             background-color: rgba(255, 255, 255, 0.05);
+         }
+
+         .highlight-container {
+             background-color: rgba(255, 255, 255, 0.05);
+             border-left: 3px solid #4CAF50;
+         }
+
+         .chat-user {
+             background-color: rgba(0, 0, 0, 0.2);
+         }
+
+         .chat-ai {
+             background-color: rgba(76, 175, 80, 0.1);
+         }
+     }
+
+     /* Light mode specific */
+     @media (prefers-color-scheme: light) {
+         .card {
+             background-color: rgba(0, 0, 0, 0.02);
+         }
+
+         .highlight-container {
+             background-color: rgba(0, 0, 0, 0.03);
+             border-left: 3px solid #4CAF50;
+         }
+
+         .chat-user {
+             background-color: rgba(240, 240, 240, 0.7);
+         }
+
+         .chat-ai {
+             background-color: rgba(76, 175, 80, 0.05);
+         }
+     }
+
+     /* Chat message styling */
+     .chat-container {
+         margin-bottom: 1rem;
+     }
+
+     .chat-message {
+         padding: 1rem;
+         border-radius: 5px;
+         margin-bottom: 0.5rem;
+     }
+
+     /* Highlight sections */
+     .highlight-container {
+         padding: 1rem;
+         margin: 1rem 0;
+         border-radius: 4px;
+     }
+
+     /* Status indicators */
+     .status-success {
+         color: #4CAF50;
+     }
+
+     .status-error {
+         color: #F44336;
+     }
+
+     /* Document list */
+     .doc-list {
+         list-style-type: none;
+         padding-left: 0;
+     }
+
+     .doc-list li {
+         padding: 0.5rem 0;
+         border-bottom: 1px solid rgba(128, 128, 128, 0.2);
+     }
+ </style>
+ """, unsafe_allow_html=True)
+
+ # Define the output structures using Pydantic
+ class DocumentAnalysis(BaseModel):
+     summary: str = Field(description="A concise summary of the document")
+     key_insights: List[str] = Field(description="A list of key insights from the document")
+     action_items: Optional[List[str]] = Field(None, description="A list of action items derived from the document")
+     open_questions: Optional[List[str]] = Field(None, description="A list of open questions or areas needing clarification")
+
+ # Function to clean up LLM responses for better parsing
+ def clean_llm_response(response):
+     """Clean up the LLM response to extract JSON content from potential markdown code blocks."""
+     # Extract content from the response
+     if isinstance(response, dict) and 'choices' in response:
+         content = response['choices'][0]['message']['content']
+     else:
+         content = str(response)
+
+     # Remove markdown code block formatting if present
+     if '```' in content:
+         # Handle ```json format
+         parts = content.split('```')
+         if len(parts) >= 3:  # Has opening and closing backticks
+             # Take the content between the first pair of backticks
+             content = parts[1]
+             # Remove the json language specifier if present
+             if content.startswith('json') or content.startswith('JSON'):
+                 content = content[4:].lstrip()
+     elif '`json' in content:
+         # Handle `json format
+         parts = content.split('`json')
+         if len(parts) >= 2:
+             content = parts[1]
+             if '`' in content:
+                 content = content.split('`')[0]
+
+     # Strip any leading/trailing whitespace
+     content = content.strip()
+
+     return content
+
+ # Initialize the LLM and model cache
+ @st.cache_resource(experimental_allow_widgets=True)
+ def load_model():
+     with st.spinner("Loading model..."):
+         try:
+             llm = Llama.from_pretrained(
+                 repo_id="stduhpf/google-gemma-3-1b-it-qat-q4_0-gguf-small",
+                 filename="gemma-3-1b-it-q4_0_s.gguf",
+             )
+             return llm
+         except Exception as e:
+             st.error(f"Error loading model: {str(e)}")
+             return None
+
+ # Initialize embeddings only when needed, to avoid torch inspection issues
+ @st.cache_resource(experimental_allow_widgets=True)
+ def load_embeddings():
+     from langchain_community.embeddings import HuggingFaceEmbeddings
+
+     with st.spinner("Loading embeddings..."):
+         embeddings = HuggingFaceEmbeddings(
+             model_name="sentence-transformers/all-MiniLM-L6-v2",
+             model_kwargs={'device': 'cpu'}
+         )
+         return embeddings
+
+ # Sidebar configuration with improved styling
+ st.sidebar.markdown("<div style='text-align: center;'><h1>🧠 DocMind AI</h1></div>", unsafe_allow_html=True)
+ st.sidebar.markdown("<div style='text-align: center;'>AI-Powered Document Analysis</div>", unsafe_allow_html=True)
+ st.sidebar.markdown("---")
+
+ # Load the LLM
+ with st.sidebar:
+     llm = load_model()
+     if llm is not None:
+         st.markdown("<div class='status-success'>✅ Model loaded successfully!</div>", unsafe_allow_html=True)
+     else:
+         st.markdown("<div class='status-error'>❌ Error loading model. Check logs for details.</div>", unsafe_allow_html=True)
+         st.stop()
+
+ # Mode selection
+ with st.sidebar:
+     st.markdown("### Analysis Configuration")
+     analysis_mode = st.radio(
+         "Analysis Mode",
+         ["Analyze each document separately", "Combine analysis for all documents"]
+     )
+
+ # Prompt selection
+ prompt_options = {
+     "Comprehensive Document Analysis": "Analyze the provided document comprehensively. Generate a summary, extract key insights, identify action items, and list open questions.",
+     "Extract Key Insights and Action Items": "Extract key insights and action items from the provided document.",
+     "Summarize and Identify Open Questions": "Summarize the provided document and identify any open questions that need clarification.",
+     "Custom Prompt": "Enter a custom prompt below:"
+ }
+
+ with st.sidebar:
+     st.markdown("### Prompt Settings")
+     selected_prompt_option = st.selectbox("Select Prompt", list(prompt_options.keys()))
+     custom_prompt = ""
+     if selected_prompt_option == "Custom Prompt":
+         custom_prompt = st.text_area("Enter Custom Prompt", height=100)
+
+ # Tone selection
+ tone_options = [
+     "Professional", "Academic", "Informal", "Creative", "Neutral",
+     "Direct", "Empathetic", "Humorous", "Authoritative", "Inquisitive"
+ ]
+
+ with st.sidebar:
+     selected_tone = st.selectbox("Select Tone", tone_options)
+
+ # Instructions selection
+ instruction_options = {
+     "General Assistant": "Act as a helpful assistant.",
+     "Researcher": "Act as a researcher providing in-depth analysis.",
+     "Software Engineer": "Act as a software engineer focusing on code and technical details.",
+     "Product Manager": "Act as a product manager considering strategy and user experience.",
+     "Data Scientist": "Act as a data scientist emphasizing data analysis.",
+     "Business Analyst": "Act as a business analyst considering strategic aspects.",
+     "Technical Writer": "Act as a technical writer creating clear documentation.",
+     "Marketing Specialist": "Act as a marketing specialist focusing on branding.",
+     "HR Manager": "Act as an HR manager considering people aspects.",
+     "Legal Advisor": "Act as a legal advisor providing legal perspective.",
+     "Custom Instructions": "Enter custom instructions below:"
+ }
+
+ with st.sidebar:
+     st.markdown("### Assistant Behavior")
+     selected_instruction = st.selectbox("Select Instructions", list(instruction_options.keys()))
+     custom_instruction = ""
+     if selected_instruction == "Custom Instructions":
+         custom_instruction = st.text_area("Enter Custom Instructions", height=100)
+
+ # Length/detail selection
+ length_options = ["Concise", "Detailed", "Comprehensive", "Bullet Points"]
+
+ with st.sidebar:
+     st.markdown("### Response Format")
+     selected_length = st.selectbox("Select Length/Detail", length_options)
+
+ # Main area
+ st.markdown("<h1 style='text-align: center;'>📄 DocMind AI: Document Analysis</h1>", unsafe_allow_html=True)
+ st.markdown("<p style='text-align: center;'>Upload documents and analyze them using the Gemma model</p>", unsafe_allow_html=True)
+
+ # File upload with improved UI
+ uploaded_files = st.file_uploader(
+     "Upload Documents",
+     accept_multiple_files=True,
+     type=["pdf", "docx", "txt", "xlsx", "md", "json", "xml", "rtf", "csv", "msg", "pptx", "odt", "epub",
+           "py", "js", "java", "ts", "tsx", "c", "cpp", "h", "html", "css", "sql", "rb", "go", "rs", "php"]
+ )
+
+ # Display uploaded files with better visual indication
+ if uploaded_files:
+     st.markdown("<div class='highlight-container'>", unsafe_allow_html=True)
+     st.markdown("### Uploaded Documents")
+     st.markdown("<ul class='doc-list'>", unsafe_allow_html=True)
+     for file in uploaded_files:
+         st.markdown(f"<li>📄 {file.name}</li>", unsafe_allow_html=True)
+     st.markdown("</ul>", unsafe_allow_html=True)
+     st.markdown("</div>", unsafe_allow_html=True)
+
+ # Function to process the documents and run the analysis
+ def run_analysis():
+     if not uploaded_files:
+         st.error("Please upload at least one document.")
+         return
+
+     # Save uploaded files to a temporary directory
+     temp_dir = tempfile.mkdtemp()
+     file_paths = []
+
+     for uploaded_file in uploaded_files:
+         file_path = os.path.join(temp_dir, uploaded_file.name)
+         with open(file_path, "wb") as f:
+             f.write(uploaded_file.getbuffer())
+         file_paths.append(file_path)
+
+     # Process documents
+     with st.spinner("Processing documents..."):
+         all_texts = []
+         processed_docs = []
+
+         progress_bar = st.progress(0)
+         for i, file_path in enumerate(file_paths):
+             processor = get_processor_for_file(file_path)
+             if processor:
+                 try:
+                     doc_data = process_document(file_path)
+                     if doc_data is not None and len(doc_data.strip()) > 0:  # Ensure we have content
+                         all_texts.append(doc_data)
+                         processed_docs.append({"name": os.path.basename(file_path), "data": doc_data})
+                 except Exception as e:
+                     st.error(f"Error processing {os.path.basename(file_path)}: {str(e)}")
+             progress_bar.progress((i + 1) / len(file_paths))
+
+         if not all_texts:
+             st.error("No documents could be processed. Please check the file formats and try again.")
+             return
+
+         # Build the prompt
+         if selected_prompt_option == "Custom Prompt":
+             prompt_text = custom_prompt
+         else:
+             prompt_text = prompt_options[selected_prompt_option]
+
+         if selected_instruction == "Custom Instructions":
+             instruction_text = custom_instruction
+         else:
+             instruction_text = instruction_options[selected_instruction]
+
+         # Add tone guidance
+         tone_guidance = f"Use a {selected_tone.lower()} tone in your response."
+
+         # Add length guidance
+         length_guidance = ""
+         if selected_length == "Concise":
+             length_guidance = "Keep your response brief and to the point."
+         elif selected_length == "Detailed":
+             length_guidance = "Provide a detailed response with thorough explanations."
+         elif selected_length == "Comprehensive":
+             length_guidance = "Provide a comprehensive in-depth analysis covering all aspects."
+         elif selected_length == "Bullet Points":
+             length_guidance = "Format your response primarily using bullet points for clarity."
+
+         # Set up the output parser
+         output_parser = PydanticOutputParser(pydantic_object=DocumentAnalysis)
+         format_instructions = output_parser.get_format_instructions()
+
+         if analysis_mode == "Analyze each document separately":
+             results = []
+
+             for doc in processed_docs:
+                 with st.spinner(f"Analyzing {doc['name']}..."):
+                     # Create the system message with combined instructions
+                     system_message = f"{instruction_text} {tone_guidance} {length_guidance} Format your response according to these instructions: {format_instructions}"
+
+                     prompt = f"""
+                     {prompt_text}
+                     Document: {doc['name']}
+                     Content: {doc['data']}
+                     """
+
+                     # Get the response from the LLM
+                     try:
+                         response = llm.create_chat_completion(
+                             messages=[
+                                 {
+                                     "role": "system",
+                                     "content": system_message
+                                 },
+                                 {
+                                     "role": "user",
+                                     "content": prompt
+                                 }
+                             ]
+                         )
+
+                         # Try to parse the response into the Pydantic model
+                         try:
+                             # Clean the response before parsing
+                             cleaned_response = clean_llm_response(response)
+                             parsed_response = output_parser.parse(cleaned_response)
+                             results.append({
+                                 "document_name": doc['name'],
+                                 "analysis": parsed_response.dict()
+                             })
+                         except Exception as e:
+                             # If parsing fails, include the raw response
+                             if isinstance(response, dict) and 'choices' in response:
+                                 raw_response = response['choices'][0]['message']['content']
+                             else:
+                                 raw_response = str(response)
+
+                             results.append({
+                                 "document_name": doc['name'],
+                                 "analysis": raw_response,
+                                 "parsing_error": str(e)
+                             })
+                     except Exception as e:
+                         st.error(f"Error analyzing {doc['name']}: {str(e)}")
+
+             # Display results with a card-based UI
+             for result in results:
+                 st.markdown("<div class='card'>", unsafe_allow_html=True)
+                 st.markdown(f"<h3>Analysis for: {result['document_name']}</h3>", unsafe_allow_html=True)
+
+                 if isinstance(result['analysis'], dict) and 'parsing_error' not in result:
+                     # Structured output
+                     st.markdown("<div class='highlight-container'>", unsafe_allow_html=True)
+                     st.markdown("### Summary")
+                     st.write(result['analysis']['summary'])
+                     st.markdown("</div>", unsafe_allow_html=True)
+
+                     st.markdown("### Key Insights")
+                     for insight in result['analysis']['key_insights']:
+                         st.markdown(f"- {insight}")
+
+                     if result['analysis'].get('action_items'):
+                         st.markdown("<div class='highlight-container'>", unsafe_allow_html=True)
+                         st.markdown("### Action Items")
+                         for item in result['analysis']['action_items']:
+                             st.markdown(f"- {item}")
+                         st.markdown("</div>", unsafe_allow_html=True)
+
+                     if result['analysis'].get('open_questions'):
+                         st.markdown("### Open Questions")
+                         for question in result['analysis']['open_questions']:
+                             st.markdown(f"- {question}")
+                 else:
+                     # Raw output
+                     st.markdown(result['analysis'])
+                     if 'parsing_error' in result:
+                         st.info(f"Note: The response could not be parsed into the expected format. Error: {result['parsing_error']}")
+
+                 st.markdown("</div>", unsafe_allow_html=True)
+
+         else:
+             with st.spinner("Analyzing all documents together..."):
+                 # Combine all documents
+                 combined_content = "\n\n".join([f"Document: {doc['name']}\n\nContent: {doc['data']}" for doc in processed_docs])
+
+                 # Create the system message with combined instructions
+                 system_message = f"{instruction_text} {tone_guidance} {length_guidance} Format your response according to these instructions: {format_instructions}"
+
+                 # Create the combined prompt
+                 prompt = f"""
+                 {prompt_text}
+                 {combined_content}
+                 """
+
+                 # Get the response from the LLM
+                 try:
+                     response = llm.create_chat_completion(
+                         messages=[
+                             {
+                                 "role": "system",
+                                 "content": system_message
+                             },
+                             {
+                                 "role": "user",
+                                 "content": prompt
+                             }
+                         ]
+                     )
+
+                     # Try to parse the response into the Pydantic model
+                     try:
+                         # Clean the response before parsing
+                         cleaned_response = clean_llm_response(response)
+                         parsed_response = output_parser.parse(cleaned_response)
+
+                         st.markdown("<div class='card'>", unsafe_allow_html=True)
+                         st.markdown("<h3>Combined Analysis for All Documents</h3>", unsafe_allow_html=True)
+
+                         st.markdown("<div class='highlight-container'>", unsafe_allow_html=True)
+                         st.markdown("### Summary")
+                         st.write(parsed_response.summary)
+                         st.markdown("</div>", unsafe_allow_html=True)
+
+                         st.markdown("### Key Insights")
+                         for insight in parsed_response.key_insights:
+                             st.markdown(f"- {insight}")
+
+                         if parsed_response.action_items:
+                             st.markdown("<div class='highlight-container'>", unsafe_allow_html=True)
+                             st.markdown("### Action Items")
+                             for item in parsed_response.action_items:
+                                 st.markdown(f"- {item}")
+                             st.markdown("</div>", unsafe_allow_html=True)
+
+                         if parsed_response.open_questions:
+                             st.markdown("### Open Questions")
+                             for question in parsed_response.open_questions:
+                                 st.markdown(f"- {question}")
+
+                         st.markdown("</div>", unsafe_allow_html=True)
+
+                     except Exception as e:
+                         # If parsing fails, show the raw response
+                         st.markdown("<div class='card'>", unsafe_allow_html=True)
+                         st.markdown("<h3>Combined Analysis for All Documents</h3>", unsafe_allow_html=True)
+
+                         # Get the raw content from the response
+                         if isinstance(response, dict) and 'choices' in response:
+                             raw_response = response['choices'][0]['message']['content']
+                         else:
+                             raw_response = str(response)
+
+                         st.markdown(raw_response)
+                         st.info(f"Note: The response could not be parsed into the expected format. Error: {str(e)}")
+                         st.markdown("</div>", unsafe_allow_html=True)
+
+                 except Exception as e:
+                     st.error(f"Error analyzing documents: {str(e)}")
+
+         # Create text chunks for embeddings
+         with st.spinner("Setting up document chat..."):
+             try:
+                 text_splitter = RecursiveCharacterTextSplitter(
+                     chunk_size=1000,
+                     chunk_overlap=200
+                 )
+
+                 all_chunks = []
+                 for doc in processed_docs:
+                     if doc['data'] and len(doc['data'].strip()) > 0:  # Verify data exists and is not empty
+                         chunks = text_splitter.split_text(doc['data'])
+                         all_chunks.extend(chunks)
+
+                 # Only create embeddings if we have chunks
+                 if all_chunks and len(all_chunks) > 0:
+                     # Load embeddings
+                     embeddings = load_embeddings()
+
+                     # Build a Chroma vector store from the chunks
+                     vectorstore = Chroma.from_texts(
+                         texts=all_chunks,
+                         embedding=embeddings,
+                         collection_name="docmind_collection",
+                         collection_metadata={"timestamp": datetime.now().isoformat()}
+                     )
+                     retriever = vectorstore.as_retriever()
+
+                     # Set up conversation memory
+                     memory = ConversationBufferMemory(
+                         memory_key="chat_history",
+                         return_messages=True
+                     )
+
+                     # Create the conversational chain
+                     qa_chain = ConversationalRetrievalChain.from_llm(
+                         llm=llm,
+                         retriever=retriever,
+                         memory=memory
+                     )
+
+                     st.session_state['qa_chain'] = qa_chain
+                     st.session_state['chat_history'] = []
+
+                     st.success("Document chat is ready! Ask questions about your documents below.")
+                 else:
+                     st.warning("No text chunks were created from the documents. Chat functionality is unavailable.")
+
+             except Exception as e:
+                 st.error(f"Error setting up document chat: {str(e)}")
+                 # For debugging purposes
+                 st.exception(e)
+
+ # Initialize chat history
+ if 'chat_history' not in st.session_state:
+     st.session_state['chat_history'] = []
+
+ # Chat interface with improved styling
+ st.markdown("---")
+ st.markdown("<h2 style='text-align: center;'>💬 Chat with your Documents</h2>", unsafe_allow_html=True)
+ st.markdown("<p style='text-align: center;'>Ask follow-up questions about the analyzed documents.</p>", unsafe_allow_html=True)
+
+ # Run the analysis when the button is clicked
+ col1, col2, col3 = st.columns([1, 2, 1])
+ with col2:
+     if st.button("Extract and Analyze", use_container_width=True):
+         run_analysis()
+
+ # Chat input and display
+ if 'qa_chain' in st.session_state:
+     st.markdown("<div class='card'>", unsafe_allow_html=True)
+     user_question = st.text_input("Ask a question about your documents:")
+
+     if user_question:
+         with st.spinner("Generating response..."):
+             try:
+                 response = st.session_state['qa_chain'].invoke({"question": user_question})
+                 st.session_state['chat_history'].append({"question": user_question, "answer": response['answer']})
+             except Exception as e:
+                 st.error(f"Error generating response: {str(e)}")
+
+     # Display the chat history with improved styling
+     for exchange in st.session_state['chat_history']:
+         st.markdown("<div class='chat-container'>", unsafe_allow_html=True)
+         st.markdown(f"<div class='chat-message chat-user'><strong>You:</strong> {exchange['question']}</div>", unsafe_allow_html=True)
+         st.markdown(f"<div class='chat-message chat-ai'><strong>DocMind AI:</strong> {exchange['answer']}</div>", unsafe_allow_html=True)
+         st.markdown("</div>", unsafe_allow_html=True)
+     st.markdown("</div>", unsafe_allow_html=True)
+
+ # Footer
+ st.markdown("---")
+ st.markdown(
+     """
+     <div style="text-align: center">
+         <p>Built with ❤️ using Streamlit, LangChain, and the Gemma model</p>
+         <p>DocMind AI - AI-Powered Document Analysis</p>
+     </div>
+     """,
+     unsafe_allow_html=True
+ )
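
For reference, a self-contained sketch of the structured-output round trip that `run_analysis` relies on; the sample JSON string stands in for a model reply:

```python
# Minimal sketch of the PydanticOutputParser flow used in app.py
from typing import List, Optional
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field

class DocumentAnalysis(BaseModel):
    summary: str = Field(description="A concise summary of the document")
    key_insights: List[str] = Field(description="A list of key insights")
    action_items: Optional[List[str]] = None
    open_questions: Optional[List[str]] = None

parser = PydanticOutputParser(pydantic_object=DocumentAnalysis)

# get_format_instructions() yields the JSON-schema prompt text that app.py
# injects into the system message; parse() validates the model's JSON reply.
print(parser.get_format_instructions()[:120])
fake_reply = '{"summary": "S", "key_insights": ["K1"], "action_items": null, "open_questions": null}'
print(parser.parse(fake_reply))
```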
docker-compose.yml ADDED
@@ -0,0 +1,9 @@
+ version: '3'
+
+ services:
+   docmind:
+     build: .
+     ports:
+       - "8501:7860"  # map host 8501 (per the README) to the container's Streamlit port from the Dockerfile
+     volumes:
+       - .:/app
requirements.txt ADDED
@@ -0,0 +1,18 @@
+ streamlit
+ pydantic
+ langchain
+ langchain-community
+ unstructured
+ unstructured-inference
+ pdf2image
+ pytesseract
+ pandas
+ chromadb
+ sentence-transformers
+ python-docx
+ pymupdf4llm
+ llama-cpp-python
+ lxml
+ python-pptx
+ pdfminer.six
+ pillow