Makima57 committed
Commit 04e30ca · verified · 1 Parent(s): 2992469

Update app.py

app to save chunked data

Files changed (1):
  1. app.py +23 -27
app.py CHANGED
@@ -2,19 +2,14 @@ import streamlit as st
 from googlesearch import search
 import requests
 from bs4 import BeautifulSoup
-import chunk  # Import the chunking functionality from app2.py
+import chunk  # Import the chunking function from chunk.py
+import json
 
-# Function to perform Google search and return the first two links
+# Function to perform Google search and return the first link
 def google_search(query):
     try:
-        query = query + "/t site:https://medium.com/"
-        search_results = search(query, num_results=10)  # Get up to 10 results
-        first_two_links = []
-        for i, link in enumerate(search_results):
-            if i < 2:
-                first_two_links.append(link)
-            else:
-                break
+        search_results = search(query, num_results=2)  # Get first two results
+        first_two_links = [next(search_results, None), next(search_results, None)]
         return first_two_links
     except Exception as e:
         st.error(f"An error occurred: {e}")
@@ -30,19 +25,15 @@ def fetch_webpage_content(url):
         st.error(f"Failed to fetch the webpage content: {e}")
         return None
 
-# Function to scrape text from webpage content using BeautifulSoup
+# Function to scrape text from webpage content using Beautiful Soup
 def scrape_text(webpage_content):
     try:
         soup = BeautifulSoup(webpage_content, 'html.parser')
-        # Remove all script and style elements
         for script in soup(["script", "style"]):
             script.decompose()
         text = soup.get_text()
-        # Break the text into lines and remove leading/trailing spaces
         lines = (line.strip() for line in text.splitlines())
-        # Break multi-headlines into a line each
         chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-        # Drop blank lines
         text = '\n'.join(chunk for chunk in chunks if chunk)
         return text
     except Exception as e:
@@ -50,7 +41,7 @@ def scrape_text(webpage_content):
         return None
 
 # Streamlit app UI
-st.title("Search Link Finder")
+st.title("Search and Chunk Webpage Content")
 
 # Input field for search query
 query = st.text_input("Enter search query", "")
@@ -60,8 +51,8 @@ if st.button("Search"):
     if query:
         first_two_links = google_search(query)
         if first_two_links:
-            for i, link in enumerate(first_two_links):
-                st.success(f"Link {i+1}: [Click here]({link})")
+            for i, link in enumerate(first_two_links, 1):
+                st.success(f"Link {i}: [Click here]({link})")
 
                 # Fetch webpage content
                 webpage_content = fetch_webpage_content(link)
@@ -69,16 +60,21 @@ if st.button("Search"):
                     # Scrape text from webpage content
                     scraped_text = scrape_text(webpage_content)
                     if scraped_text:
-                        st.write(f"Scraped Content from Link {i+1} (Chunked):")
-
-                        # Call the chunking function from app2.py
-                        chunk.display_chunks(scraped_text)
-
-                        # Option to download the entire scraped content
+                        # Chunk the scraped text using chunk.py
+                        chunked_text = chunk.chunk_text(scraped_text)
+
+                        # Save chunked data to a file for later use
+                        with open("chunked_data.json", "w") as f:
+                            json.dump(chunked_text, f)
+
+                        st.write(f"Chunked Data for Link {i}:")
+                        for chunk_part in chunked_text:
+                            st.write(chunk_part)
+
                         st.download_button(
-                            label=f"Download Full Webpage Content from Link {i+1}",
-                            data=scraped_text,
-                            file_name=f"webpage_content_{i+1}.txt",
+                            label="Download Chunked Webpage Content",
+                            data="\n".join(chunked_text),
+                            file_name="chunked_webpage_content.txt",
                             mime="text/plain"
                         )
                     else:
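
The updated app.py imports chunk and calls chunk.chunk_text(scraped_text), but chunk.py itself is not part of this commit, so its interface can only be inferred: json.dump(chunked_text, f) and "\n".join(chunked_text) imply it returns a list of strings. A minimal sketch of what such a module could look like, assuming a fixed-size splitter on word boundaries (the chunk_size parameter and the whole implementation are illustrative, not the repository's actual code):

# chunk.py -- hypothetical sketch, not the module from this repository
def chunk_text(text, chunk_size=500):
    """Split text into chunks of roughly chunk_size characters,
    breaking only on word boundaries; return a list of strings."""
    words = text.split()
    chunks, current, length = [], [], 0
    for word in words:
        # Start a new chunk once adding the next word would exceed chunk_size
        if current and length + len(word) + 1 > chunk_size:
            chunks.append(" ".join(current))
            current, length = [], 0
        current.append(word)
        length += len(word) + 1
    if current:
        chunks.append(" ".join(current))
    return chunks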
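
Since the point of the commit is to persist the chunked data, a separate script can reload chunked_data.json later (a sketch; the filename matches the diff, the rest is illustrative). Note that the save happens inside the per-link loop with mode "w", so the file is overwritten on each iteration and only the last link's chunks survive:

# reuse_chunks.py -- hypothetical consumer of the file app.py writes
import json

with open("chunked_data.json") as f:
    chunks = json.load(f)  # list of strings, as written by json.dump

for i, part in enumerate(chunks, 1):
    print(f"Chunk {i} ({len(part)} chars): {part[:60]}")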