bainskarman committed
Commit d3377e2 · verified · 1 Parent(s): 15f5963

Update app.py

Files changed (1): app.py +14 -116
app.py CHANGED
@@ -1,17 +1,12 @@
 import streamlit as st
 import os
 import requests
-from PyPDF2 import PdfReader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain.vectorstores import FAISS
-from langdetect import detect
 
 # Load the Hugging Face token from environment variables (secrets)
 token = os.environ.get("KEY2") # Replace "KEY2" with your secret key name
 
-# Initialize the Hugging Face Inference API
-def query_huggingface_api(prompt, max_new_tokens=200, temperature=0.7, top_k=50):
+# Function to query the Hugging Face API
+def query_huggingface_api(prompt, max_new_tokens=50, temperature=0.7, top_k=50):
     model_name = "HuggingFaceH4/zephyr-7b-alpha" # Replace with your preferred model
     api_url = f"https://api-inference.huggingface.co/models/{model_name}"
     headers = {"Authorization": f"Bearer {token}"}
@@ -30,117 +25,20 @@ def query_huggingface_api(prompt, max_new_tokens=200, temperature=0.7, top_k=50)
         st.error(f"Error: {response.status_code} - {response.text}")
         return None
 
-# Extract text from PDF
-def extract_text_from_pdf(file):
-    reader = PdfReader(file)
-    text = ""
-    for page in reader.pages:
-        text += page.extract_text()
-    return text
-
-# Split text into chunks
-def split_text(text, chunk_size=1000, chunk_overlap=200):
-    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-    chunks = splitter.split_text(text)
-    return chunks
-
-# Create embeddings and vector store
-def create_vector_store(chunks, indexing_method="multi-representation", **kwargs):
-    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
-    if indexing_method == "multi-representation":
-        vector_store = FAISS.from_texts(chunks, embeddings)
-    elif indexing_method == "raptors":
-        # Implement RAPTORS logic here (e.g., hierarchical chunking)
-        vector_store = FAISS.from_texts(chunks, embeddings)
-    elif indexing_method == "colbert":
-        # Implement ColBERT logic here (e.g., contextualized embeddings)
-        vector_store = FAISS.from_texts(chunks, embeddings)
-    return vector_store
-
-# Query the PDF using the Hugging Face API
-def query_pdf(vector_store, query, query_method="multi-query", max_new_tokens=200, temperature=0.7, top_k=50):
-    # Retrieve relevant chunks from the vector store
-    docs = vector_store.similarity_search(query)
-    context = " ".join([doc.page_content for doc in docs])
-
-    # Create a prompt for the LLM
-    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
-
-    # Query the Hugging Face API
-    answer = query_huggingface_api(prompt, max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k)
-    return answer, docs
-
-# Detect language of the text
-def detect_language(text):
-    try:
-        return detect(text)
-    except:
-        return "en" # Default to English if detection fails
-
 # Streamlit App
 def main():
-    st.title("Chat with PDF")
-    st.write("Upload a PDF and ask questions about it!")
-
-    # File uploader
-    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
-    if uploaded_file is None:
-        st.info("Using default PDF.")
-        uploaded_file = "default.pdf" # Add a default PDF
-
-    # Step 1: Extract text and split into chunks
-    if "text" not in st.session_state:
-        st.session_state.text = None
-    if "chunks" not in st.session_state:
-        st.session_state.chunks = None
-
-    if st.button("Extract Text and Split into Chunks"):
-        st.session_state.text = extract_text_from_pdf(uploaded_file)
-        st.session_state.chunks = split_text(st.session_state.text)
-        st.success("Text extracted and split into chunks!")
-
-    # Step 2: Create vector store
-    if "vector_store" not in st.session_state:
-        st.session_state.vector_store = None
-
-    if st.session_state.chunks:
-        st.subheader("Indexing Options")
-        indexing_method = st.selectbox(
-            "Indexing Method",
-            ["multi-representation", "raptors", "colbert"],
-            help="Choose how to index the PDF text."
-        )
-        if st.button("Create Vector Store"):
-            st.session_state.vector_store = create_vector_store(st.session_state.chunks, indexing_method=indexing_method)
-            st.success("Vector store created!")
-
-    # Step 3: Query the PDF
-    if st.session_state.vector_store:
-        st.subheader("Query Translation Options")
-        query_method = st.selectbox(
-            "Query Translation Method",
-            ["multi-query", "rag-fusion", "decomposition", "step-back", "hyde"],
-            help="Choose a method to improve query retrieval."
-        )
-        st.subheader("LLM Parameters")
-        temperature = st.slider("Temperature", 0.1, 1.0, 0.7, help="Controls randomness in the output.")
-        top_k = st.slider("Top-k", 1, 100, 50, help="Limits sampling to the top-k tokens.")
-        max_new_tokens = st.slider("Max New Tokens", 50, 500, 200, help="Maximum number of tokens to generate.")
-        query = st.text_input("Ask a question about the PDF:")
-        if query:
-            answer, source_docs = query_pdf(
-                st.session_state.vector_store,
-                query,
-                query_method=query_method,
-                max_new_tokens=max_new_tokens,
-                temperature=temperature,
-                top_k=top_k,
-            )
-            if answer:
-                st.write("**Answer:**", answer)
-                st.write("**Source Text:**")
-                for doc in source_docs:
-                    st.write(doc.page_content)
+    st.title("Hugging Face API Test")
+    st.write("Enter a prompt and get a response from the model.")
+
+    # Input prompt
+    prompt = st.text_input("Enter your prompt:")
+    if prompt:
+        st.write("**Prompt:**", prompt)
+
+        # Query the Hugging Face API
+        response = query_huggingface_api(prompt)
+        if response:
+            st.write("**Response:**", response)
 
 if __name__ == "__main__":
     main()
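Between the two hunks, the diff hides the middle of query_huggingface_api (lines 13-24 of the new file): building the request, posting it, and the success branch that precedes the visible st.error fallback. A plausible sketch of that elided section, assuming the standard Inference API text-generation contract; the payload shape is an assumption, not something shown in this commit:

    # Hypothetical reconstruction of the elided middle of query_huggingface_api;
    # the payload shape is assumed from the Inference API text-generation task.
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
        },
    }
    response = requests.post(api_url, headers=headers, json=payload)
    if response.status_code == 200:
        # Text-generation endpoints answer with [{"generated_text": "..."}]
        return response.json()[0]["generated_text"]
    else:
        # These two lines are the context visible in the second hunk.
        st.error(f"Error: {response.status_code} - {response.text}")
        return None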
 
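With the rewrite in place, the app needs only the KEY2 secret and a plain streamlit run app.py to start. The endpoint can also be smoke-tested outside Streamlit; a minimal standalone sketch, assuming the same model and token, with the "inputs" payload key taken from the Inference API convention rather than from this commit:

    import os
    import requests

    # Standalone sanity check of the endpoint app.py talks to.
    token = os.environ.get("KEY2")
    api_url = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-alpha"
    headers = {"Authorization": f"Bearer {token}"}

    resp = requests.post(api_url, headers=headers, json={"inputs": "Say hello."})
    print(resp.status_code)
    print(resp.json())  # expect [{"generated_text": "..."}] on success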