Dede16 commited on
Commit
416509f
·
1 Parent(s): 9cf3f78

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -45
app.py CHANGED
@@ -14,57 +14,153 @@ import base64
14
  import random
15
  from PIL import Image
16
  import gradio as gr
 
17
 
18
- def portal_scraping(show_more_count):
19
- keyword = "ai"
20
- url = 'https://www.cxnetwork.com/'
 
21
 
22
- options = webdriver.ChromeOptions()
23
- options.add_argument('--headless')
24
- options.add_argument('--no-sandbox')
25
- options.add_argument('--disable-dev-shm-usage')
26
-
27
- user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
28
- options.add_argument(f"user-agent={user_agent}")
29
 
30
- wd = webdriver.Chrome(options=options)
31
- wd.get(url+"search/?q="+keyword)
32
 
33
- # Fungsi Untuk Menentukan Berapa Kali Menekan Tombol "Show More" Sesuai Dengan Keinginan User
34
- def click_show_more():
35
- wd.find_element(By.CSS_SELECTOR, 'body').send_keys(Keys.CONTROL, Keys.END)
36
- time.sleep(3)
37
- try:
38
- elem = wd.find_element(By.CSS_SELECTOR, "span[class='semibold']")
39
- elem.click()
40
- time.sleep(15)
41
- except:
42
- print("error")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- for i in range(int(show_more_count)):
45
- click_show_more()
 
 
 
 
 
 
 
 
 
 
 
46
 
47
- wd.find_element(By.CSS_SELECTOR, 'body').send_keys(Keys.CONTROL, Keys.END)
48
- time.sleep(1)
 
 
 
 
 
 
 
49
 
50
- raw_html = wd.find_element(By.TAG_NAME, 'body').get_attribute('innerHTML')
51
- wd.quit()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
- soup_html = BeautifulSoup(raw_html, "html.parser")
54
- # Membuat Variabel Yang digunakan Untuk Menyimpan URL Yang Berhasil discraping
55
- portal = []
 
56
 
57
- containers = soup_html.findAll('div', attrs={'class':'mt-2 ml-3 mr-3'})
58
- for container in containers:
59
- link = container.findAll('p')
60
- for links in link:
61
- url = links.findAll('a')
62
- for urls in url:
63
- href = urls.get('href')
64
- portal.append(href)
 
 
 
 
 
 
 
65
 
66
- return portal
67
-
68
  with gr.Blocks(theme = "soft", title="Article Writer") as article_generator:
69
  gr.Markdown(
70
  """
@@ -72,11 +168,14 @@ with gr.Blocks(theme = "soft", title="Article Writer") as article_generator:
72
  Buat artikel yang anda inginkan dengan mudah, cukup menuliskan keyword atau topik artikel yang ingin anda buat dan dalam sekejap artikel tersebut menjadi milik anda.
73
  """)
74
  with gr.Row():
75
- inputs = gr.Textbox(placeholder="Keyword/Topik", show_label=False)
76
  with gr.Row():
77
- button_scrap = gr.Button("Scrap")
78
  with gr.Row():
79
- outputs = gr.Textbox("", label="URL")
80
- button_scrap.click(fn=portal_scraping, inputs=inputs, outputs=outputs)
 
 
 
81
  if __name__ == "__main__":
82
  article_generator.launch(share=False)
 
14
  import random
15
  from PIL import Image
16
  import gradio as gr
17
import os  # needed for the environment lookup below

# SECURITY FIX: the original hard-coded an OpenAI secret key directly in
# source ('sk-…'); it is now in the repository history and MUST be revoked
# and rotated. Read the key from the environment instead so no secret ever
# lives in the code.
openai.api_key = os.getenv("OPENAI_API_KEY")
18
 
19
def split_article(article_text):
    """Split *article_text* into four roughly equal quarters by word count.

    The cut points sit at 1/4, 1/2 and 3/4 of the total number of
    whitespace-separated words, so the fourth quarter absorbs any
    remainder.  Returns a 4-tuple of strings (possibly empty for very
    short input).
    """
    tokens = article_text.split()
    n = len(tokens)
    # Five boundaries delimit the four slices: [0, n/4, n/2, 3n/4, n].
    bounds = (0, n // 4, n // 2, (3 * n) // 4, n)
    return tuple(
        ' '.join(tokens[bounds[i]:bounds[i + 1]]) for i in range(4)
    )
 
30
 
31
def clean_scrap(artikel, url):
    """Clean, paraphrase and title scraped article text via OpenAI.

    Parameters
    ----------
    artikel : list[str]
        Raw article text: one chunk, or several chunks (the caller splits
        long articles into quarters, and multi-chunk input is additionally
        summarized to keep each piece under the model's limits).
    url : str
        Source URL; appended to the failure log when any step fails.

    Returns
    -------
    tuple
        ``(title, judul, url, content)`` — English title, Indonesian
        title, the source URL and the paraphrased article body — or
        ``(None, None, None, None)`` when any model call finishes
        abnormally or the final body is 2000 characters or shorter.
    """

    def _log_failure():
        # NOTE(review): `file_path` is not defined anywhere in this hunk —
        # it presumably comes from the unchanged top of the file; confirm,
        # otherwise every failure path raises NameError.
        with open(file_path, 'a') as file:
            file.write(url + '\n')

    def _chat(system_msg, user_msg, temperature):
        # Shared wrapper for the four near-identical ChatCompletion calls
        # that were previously duplicated inline.
        return openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": user_msg},
            ],
            temperature=temperature,
        )

    # --- 1. Clean every chunk (and summarize when input is multi-part). ---
    summarize = len(artikel) > 1
    new_artikel = []
    for art in artikel:
        if summarize:
            user_msg = ("I have a raw article that contains a lot of unnecessary data such as ads, website information, and article publishers, as well as links to other pages, and so on. Please clean up the article I provided so that only the article's content remains. \nThen, you should also summarize the article so that it does not exceed 5000 characters" + art + "\nDo not write any explanation and any pleasantries. Please use the following complete format to display the output: {the cleaned and summarized article's content}")
        else:
            user_msg = ("I have a raw article that contains a lot of unnecessary data such as ads, website information, and article publishers, as well as links to other pages, and so on. Please clean up the article I provided so that only the article's content remains." + art + "\nDo not write any explanation and any pleasantries. Please use the following complete format to display the output: {the cleaned article's content}")
        response = _chat("You are a very professional article editor.", user_msg, 0.1)
        if response['choices'][0]['finish_reason'] in ('length', 'stop'):
            new_artikel.append(response['choices'][0]['message']['content'])
        else:
            _log_failure()
            return None, None, None, None

    # --- 2. Paraphrase the joined text and generate an English title. ---
    joined = ' '.join(new_artikel)
    response = _chat(
        "You are a very professional article editor and capable of generating compelling and professional article titles.",
        "Paraphrase the above article to make it a well-written and easily understandable piece for humans, following the conventions of renowned articles. \nThen, You Must Generate a title that is appropriate for the article I provided. The title should be professional, similar to typical article titles and sound more natural for a human to read" + joined + "\nDo not write any explanation and any pleasantries. Please use the following complete format to display the output: title:{title}, article: {new paraphrased article}",
        0.1,
    )
    if response['choices'][0]['finish_reason'] not in ('length', 'stop'):
        _log_failure()
        return None, None, None, None
    # First line carries "title:…", the rest is the paraphrased body.
    content = response['choices'][0]['message']['content'].split("\n")
    title = content[0].replace('title:', '').strip()

    # --- 3. Translate the title into Indonesian ("judul"). ---
    response = _chat(
        "You are a professional translator and rewriter",
        "Please translate and rewrite this sentence into Indonesian language with the following requirements: \n1. The sentence should be concise, compact, and clear. \n2. The sentence length should not exceed 50 characters. \n3. The sentences should be professional, similar to typical article titles and sound more natural for a human to read.:" + title + "\nDo not write any explanation and any pleasantries. Please use the following complete format to display the output: Judul:{hasil rewrite}",
        0,
    )
    judul = response['choices'][0]['message']['content']
    judul = judul.replace("Judul:", '').strip()
    judul = judul.replace("Title:", '').strip()

    # --- 4. Strip label prefixes from the body and apply the length floor. ---
    body = ' '.join(content[1:]).replace("article:", '').replace("Article:", '').strip()
    if len(body) > 2000:
        # BUG FIX: the original returned `content` (the raw line list,
        # title line included) instead of the cleaned body string.
        return title, judul, url, body
    _log_failure()
    return None, None, None, None
113
 
114
def scrap_artikel(alamat):
    """Scrape the first workable URL in *alamat*, clean it with OpenAI and
    return the result for the Gradio UI.

    Parameters
    ----------
    alamat : str
        Contents of the "Link Article" textbox; accepts one URL per line.
        (BUG FIX: the original iterated the string itself, i.e. visited it
        character by character.)

    Returns
    -------
    tuple
        ``(title, judul, url, content)`` from :func:`clean_scrap`, or
        ``(None, None, None, None)`` when nothing could be scraped.
    """
    links = [u.strip() for u in alamat.splitlines() if u.strip()]
    for url in links:
        # NOTE(review): `cek_url` is not defined in this hunk — presumably
        # an "already processed?" check living in the unchanged top of the
        # file; confirm it exists.
        if cek_url(url):
            continue

        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
        options.add_argument(f"user-agent={user_agent}")

        wd = webdriver.Chrome(options=options)
        try:
            # BUG FIX: load the article page itself. The original called
            # wd.get(url + "search/?q=" + keyword) — a leftover from the
            # removed portal scraper that both referenced an undefined
            # `keyword` (NameError) and mangled the article URL.
            wd.get(url)
            # Scroll to the bottom so lazily-loaded paragraphs render.
            wd.find_element(By.CSS_SELECTOR, 'body').send_keys(Keys.CONTROL, Keys.END)
            time.sleep(1)
            raw_html = wd.find_element(By.TAG_NAME, 'body').get_attribute('innerHTML')
        finally:
            # Resource fix: always release the browser, even on error.
            wd.quit()

        soup_html = BeautifulSoup(raw_html, "html.parser")
        paragraf = ' '.join(p.get_text() for p in soup_html.findAll('p'))

        # Very long articles are quartered so each model call stays small.
        if len(paragraf) >= 18000:
            artikels = list(split_article(paragraf))
        else:
            artikels = [paragraf]

        len_artikel = len("".join(artikels))
        if 1200 < len_artikel < 28000:
            # BUG FIX: unpack in the order clean_scrap actually returns
            # (title, judul, url, content). The original unpacked as
            # `contents, url, title, judul` and then returned the
            # undefined name `content`.
            title, judul, url, content = clean_scrap(artikels, url)
            return title, judul, url, content
        # NOTE(review): `file_path` must be defined at module level — not
        # visible in this hunk; confirm.
        with open(file_path, 'a') as file:
            file.write(url + '\n')
        return None, None, None, None

    # No usable URL at all: still yield four values so the four Gradio
    # output components receive something.
    return None, None, None, None
163
 
 
 
164
  with gr.Blocks(theme = "soft", title="Article Writer") as article_generator:
165
  gr.Markdown(
166
  """
 
168
  Buat artikel yang anda inginkan dengan mudah, cukup menuliskan keyword atau topik artikel yang ingin anda buat dan dalam sekejap artikel tersebut menjadi milik anda.
169
  """)
170
    # One URL (or one per line) goes in; four result fields come out.
    with gr.Row():
        inputs = gr.Textbox(placeholder="Link Article", show_label=False)
    with gr.Row():
        button_scrap = gr.Button("Scrap Article")
    with gr.Row():
        # scrap_artikel's return statement yields (title, judul, url,
        # content), matching the order of the outputs list below.
        title = gr.Textbox("", label="Title")
        judul = gr.Textbox("", label="Judul")
        url = gr.Textbox("", label="URL")
        content = gr.Textbox("", label="Content")
    button_scrap.click(fn=scrap_artikel, inputs=inputs, outputs=[title,judul,url,content])

# Launch only when run as a script; share=False keeps the app local.
if __name__ == "__main__":
    article_generator.launch(share=False)