File size: 2,939 Bytes
50bf7dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import streamlit as st
from txtai.pipeline import Textractor
from txtai.embeddings import Embeddings
import nltk
nltk.download('punkt')
  #Web Scraping
import bs4 as bs
import urllib.request
import re
# Create embeddings model, backed by sentence-transformers & transformers
# NOTE(review): Streamlit re-executes the script on each interaction, so this
# model is presumably rebuilt every run - consider Streamlit's resource
# caching once the installed Streamlit version is confirmed.
embeddings = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2"})

# Banner image displayed under the title.
url = "https://cdn.pixabay.com/photo/2022/02/25/09/23/background-7033808_1280.jpg"

st.title("AIP-S³")
st.write("AI Powered Smart Search System")
st.image(url)

st.markdown('_Welcome to Question Answering System 🧠 🤖_')

# Sidebar switch between the two supported source types; the value of `a`
# decides which input form and which search function are used further down.
a = st.sidebar.radio("SELECT -", ['PDF', 'Website'])

def my_function_pdf():
  """Write the three PDF sentences that best match the user's query.

  Reads two module-level values set by the PDF input form:
  ``locations_max`` (the list of PDF paths typed by the user) and ``quer``
  (the question). Every document is split into sentences with
  ``Textractor(sentences=True)``, the sentences are ranked against the
  query with ``embeddings.similarity`` and the top three are written to
  the page with ``st.write``.

  Blank path fields are skipped. A warning is shown, and no ranking is
  attempted, when there is no usable path, no query, or no extracted text.
  """
  # An unfilled text input yields an empty string; never hand that to the
  # extractor as if it were a file path.
  paths = [path.strip() for path in locations_max if path and path.strip()]
  if not paths:
    st.warning('Please enter at least one PDF path.')
    return
  if not quer or not quer.strip():
    st.warning('Please enter a query.')
    return

  textract = Textractor(sentences=True)

  # Flatten the per-document sentence lists into one searchable list.
  total_lines = []
  for path in paths:
    total_lines.extend(textract(path))
  if not total_lines:
    st.warning('No text could be extracted from the given PDF files.')
    return

  # Each result's first element is the index of the sentence in total_lines;
  # only the three leading results are displayed.
  for match in embeddings.similarity(quer, total_lines)[:3]:
    st.write(total_lines[match[0]])

## webscrap function
def my_web():
  """Write the three web-page sentences that best match the user's query.

  Reads two module-level values set by the website input form:
  ``locations_max`` (the list of URLs typed by the user) and ``quer`` (the
  question). Each URL is passed to ``Textractor(sentences=True)``, the
  resulting sentences are ranked against the query with
  ``embeddings.similarity`` and the top three are written to the page with
  ``st.write``.

  The page text comes solely from ``Textractor``. No separate
  urllib/BeautifulSoup download is done: its scraped text was never used,
  each page was fetched twice and the HTTP response was left unclosed.

  Blank URL fields are skipped. A warning is shown, and no ranking is
  attempted, when there is no usable URL, no query, or no extracted text.
  """
  # An unfilled text input yields an empty string; skip those entries.
  urls = [link.strip() for link in locations_max if link and link.strip()]
  if not urls:
    st.warning('Please enter at least one URL.')
    return
  if not quer or not quer.strip():
    st.warning('Please enter a query.')
    return

  textract = Textractor(sentences=True)

  # Flatten the per-page sentence lists into one searchable list.
  total_lines = []
  for link in urls:
    total_lines.extend(textract(link))
  if not total_lines:
    st.warning('No text could be extracted from the given web pages.')
    return

  # Each result's first element is the index of the sentence in total_lines;
  # only the three leading results are displayed.
  for match in embeddings.similarity(quer, total_lines)[:3]:
    st.write(total_lines[match[0]])




##

# Both modes follow the same flow: ask how many sources there are, collect
# one location per source, collect the query, then run the matching search
# once the user presses the button. `locations_max` and `quer` must remain
# module-level names because the search functions read them as globals.
if a == 'PDF':
  # min_value keeps the count at one or more so an input field is always shown.
  number = st.number_input('Insert a number of files -', min_value=1, value=1, step=1)
  st.write('Number of PDF files - ', number)
  st.markdown("---")
  locations_max = []
  for i in range(number):
    # A distinct key per field lets the identically-labelled inputs coexist.
    loc = st.text_input('Enter the PDF path :', placeholder='ex- /content/drive/MyDrive/', key=i)
    locations_max.append(loc)

  # for query
  quer = st.text_input('ask me anything!', placeholder='ex - what is AI?')
  st.write('Your query is - ', quer)

  # for textraction
  if st.button('Confirm!'):
    st.write('Confirmed')
    my_function_pdf()
## web
else:
  # min_value keeps the count at one or more so an input field is always shown.
  number = st.number_input('Insert a number of Links -', min_value=1, value=1, step=1)
  st.write('Number of web pages - ', number)
  st.markdown("---")
  locations_max = []
  for i in range(number):
    # A distinct key per field lets the identically-labelled inputs coexist.
    loc = st.text_input('Enter the URL :', placeholder='ex- https://', key=i)
    locations_max.append(loc)

  # for query
  quer = st.text_input('ask me anything!', placeholder='ex - what is AI?')
  st.write('Your query is - ', quer)

  if st.button('Confirm!'):
    st.write('Confirmed')
    my_web()