A web-scraping tool for The Verge and TechRadar.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

GOOGLE.pyw 16 KiB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375
  1. import tkinter
  2. from tkinter import *
  3. from os.path import expanduser
  4. from tkinter import filedialog
  5. from tkinter.ttk import Progressbar
  6. from urllib.request import urlopen as uReq
  7. from bs4 import BeautifulSoup as soup
  8. import io
  9. from mtranslate import translate
  10. import playsound
  11. import sys
  12. import time
  13. from threading import *
  14. import os
  15. from tkinter import messagebox
# Default output directory for the generated text files.
# NOTE(review): despite the name, this points at ~/Documents, not the desktop.
desktop = expanduser("~/Documents")
# Initial news source shown on the toggle button ("verge" or "techradar").
agency = "verge"
  18. def chooseDirectory():
  19. currdir = os.getcwd()
  20. tempdir = filedialog.askdirectory(parent=root, initialdir=currdir, title='Please select a directory')
  21. program.directory = tempdir
  22. def switchAgencies(agencies):
  23. print("called Agencies")
  24. if agencies == "verge":
  25. print("switching to techradar")
  26. agencies = "techradar"
  27. else:
  28. print("switching to verge")
  29. agencies = "verge"
  30. button4['text'] = agencies
class Scrapers(object):
    """Tk-driven scraper for The Verge / TechRadar news pages.

    Downloads each article from the selected site, writes the raw text to
    ``News.txt`` and a machine translation (en -> ru via mtranslate) to
    ``TranslatedNews.txt`` in ``self.directory``, updating progress bars
    and labels in the surrounding Tk window as it goes.

    NOTE(review): this file was recovered from a whitespace-flattened
    copy; the nesting below is reconstructed to the only reading that is
    syntactically and logically consistent — verify against the original.
    """
    def __init__(self):
        # Worker thread created by start_thread(); None until then.
        self.thread1 = None
        # Signalled by stopTheThread(); the worker checks it after each article.
        self.stop_threads = Event()
        # Raised by the worker once it has actually shut down.
        self.stopped = False
        # Shown while the worker finishes the current article after a stop request.
        self.CloseLabel = Label(root, text = "Finalizing before breaking!")
        # Output directory for the two text files (defaults to ~/Documents).
        self.directory = desktop
        # One-shot flag set by skip() to abandon the current article.
        self.needToSkip = False
    def waitandkill(self):
        """Poll once for worker shutdown.

        NOTE(review): the else branch references ``self.waitandkill``
        without parentheses, so it never recurses — as written this
        sleeps one second, checks once, and returns.
        """
        time.sleep(1)
        if (self.stopped == True):
            print("DEAD")
        else:
            self.waitandkill  # no-op attribute access: missing () — never re-polls
    def stopTheThread(self):
        """Request the scraper thread to stop (bound to the STOP! button)."""
        print("CALLED ME TOO?")
        self.stop_threads.set()
        self.CloseLabel.pack()
        self.waitandkill  # NOTE(review): missing () — the wait never actually runs
        print("calling wait")
    def skip(self):
        """Flag the current article to be skipped (bound to the SKIP! button)."""
        self.needToSkip = True
    def start_thread(self):
        """Verify the output files are writable, then launch start_now()
        on a background thread (bound to the Start button)."""
        Skip = Button(topFrame, text = "SKIP!", command = self.skip)
        Skip.pack(side = BOTTOM)
        # Probe both output files up front so we can fail fast with a
        # message box instead of dying mid-scrape.
        try:
            f = io.open(self.directory + "/TranslatedNews.txt", "w", encoding="utf-8")
        except IOError:
            print("FILE ERROR!" + self.directory + "/TranslatedNews.txt")
            messagebox.showerror("ACCESS ERROR!", "We can't access "+ self.directory + "/TranslatedNews.txt")
            sys.exit()
        try:
            f = io.open(self.directory + "/News.txt", "w", encoding="utf-8")
        except IOError:
            print("FILE ERROR!" + self.directory + "/News.txt")
            messagebox.showerror("ACCESS ERROR!", "We can't access "+ self.directory + "/News.txt")
            sys.exit()
        # NOTE(review): the probe handles above are left open and truncated;
        # start_now() reopens both files itself.
        if self.thread1!=None:
            print("NO!")  # a worker already exists; refuse to start a second one
        else:
            self.thread1 = Thread(target = self.start_now)
            self.thread1.start()
            threadActive = 1  # NOTE(review): local flag, never read anywhere
    def start_now(self):
        """Scrape every article on the selected site's front listing page.

        The branch is chosen from button4's current text; the two
        branches are near-duplicates that differ only in URL and the
        CSS classes used to locate listings, titles, bodies and images.
        Runs on a worker thread but updates Tk widgets directly.
        """
        print("Getting" + button4['text'])
        if button4['text'] == "techradar":
            progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate')
            progress['value'] = 0
            progress.pack(side = TOP)
            Labels = Label(topFrame, text = "SCRAPING")
            Labels.pack(side = TOP)
            texts = "change"  # NOTE(review): local, never read
            # Fetch the news listing page and collect one container per article.
            main_url = 'https://www.techradar.com/news'
            uClient = uReq(main_url)
            page_html = uClient.read()
            uClient.close()
            page_soup = soup(page_html, "html.parser")
            containers = page_soup.findAll("div",{"class":"listingResult"})
            Articles = len(containers)
            print(Articles)
            filename = self.directory + "/News.txt"
            trans_filename = self.directory + "/TranslatedNews.txt"
            f = io.open(filename, "w", encoding="utf-8")
            f.write("ACTIVE")
            t = io.open(trans_filename, "w", encoding ="utf-8")
            t.write("ACTIVE")
            Labels.config(text = "setting file!")
            i = 0
            CurrentTitle = Label(topFrame, text = "Preparing...")
            CurrentTitle.pack(side = TOP)
            for container in containers:
                i = i + 1
                Labels.config(text = "jumping to URL!")
                print(container["class"])
                # Skip sponsored listings.
                if 'sponsored-post' in container["class"]:
                    print("\n WE'VE CATCHED AN AD!")
                    continue
                progress['value'] = i * 100 / Articles
                # Per-article progress bar.
                # NOTE(review): a new bar is packed for every article and
                # never destroyed, so they accumulate in the window.
                local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate')
                local_progress['value'] = 0
                local_progress.pack(side = BOTTOM)
                requiredURL = container.a["href"]
                secondary_URL = requiredURL
                print("Set target URL!" + requiredURL)
                # Fetch the individual article page.
                secClient = uReq(secondary_URL)
                news_html = secClient.read()
                secClient.close()
                news_soup = soup(news_html, "html.parser")
                news_soup.decode('utf-8', 'ignore')  # NOTE(review): result discarded — effectively a no-op
                # Pages with this icon appear to be galleries/non-articles; skip them.
                squash = news_soup.findAll("div",{"class":"icon-plus_circle"})
                print(len(squash))
                if len(squash)>0:
                    print("\n WARNING! THIS IS NOT AN ARTICLE! ")
                    print(container.div["class"])
                    continue
                news_containers = news_soup.findAll("header")
                if len(news_containers)>0:
                    news_title = news_containers[0].h1.text
                    CurrentTitle.config(text = news_title)
                    Labels.config(text = "Extracted Title!")
                else:
                    print("ERROR! NO TITLE AT "+secondary_URL)
                    Labels.config(text = "Failed to extract title")
                    # NOTE(review): execution still falls through below and
                    # uses news_title — NameError on the first article, or a
                    # stale title from the previous one.
                news_body = news_soup.findAll("div", {"id":"article-body"})
                print("\n TITLE: " + news_title)
                f.write("\n \n" + news_title + "\n")
                print("Now translating...")
                translatedQuery = translate(news_title, "ru", "en")
                t.write("\n \n" + translatedQuery + "\n")
                paragraphs = news_body[0].findAll("p")
                print("Title Recorded!")
                local_progress['value'] = 10
                # Split the paragraphs into two halves so each translate()
                # call stays under the service's query-size limit (presumably).
                y = len(paragraphs)
                x = 0
                fullText = ""
                fullText2 = ""
                for paragraph in paragraphs:
                    x = x + 1
                    local_progress['value'] = x * 100 / y + 10
                    stringx = str(x)
                    Labels.config(text = "Getting paragraph " + stringx + "...")
                    print(paragraph.text + "\n \n \n")
                    if x >= y/2:
                        fullText2 = fullText2 + paragraph.text.strip()
                    else:
                        fullText = fullText + paragraph.text.strip()
                    Labels.config(text = "Written and Translated Paragraph" + stringx + "!")
                    print("Writing Paragraph " + stringx + "...")
                    # Abandon this article if the user pressed SKIP!.
                    if self.needToSkip:
                        break
                if self.needToSkip:
                    self.needToSkip = False  # reset the one-shot flag
                    continue
                # Translate both halves and write original + translation.
                translatedQuery = translate(fullText, "ru", "en")
                completeText = translatedQuery
                translatedQuery = translate(fullText2, "ru", "en")
                completeText = completeText + translatedQuery
                f.write("\n" + fullText + fullText2)
                t.write("\n" + completeText)
                # Hero image URL, if the page declares one.
                news_picture = news_soup.findAll("source", {"class":"hero-image"})
                Labels.config(text = "Getting image...")
                if len(news_picture) > 0:
                    article_pic = news_picture[0].get("data-original-mos")
                    Labels.config(text = "Picture recieved!")
                else:
                    print("\n THIS ARTICLE HAS NO PICTURE! ")
                    Labels.config(text = "Failed to locate picture :(")
                    # NOTE(review): article_pic is still written below —
                    # NameError on the first article without a picture,
                    # or a stale URL from a previous article.
                local_progress['value'] = 120
                f.write("\n PICTURE URL: " + article_pic)
                t.write("\n PICTURE URL: " + article_pic)
                # Honor a pending stop request between articles.
                if self.stop_threads.is_set():
                    print("I SURRENDER!")
                    self.stopped = True
                    f.close()
                    t.close()
                    self.CloseLabel.config(text = "you may close now")
                    sys.exit()
                    self.CloseLabel.config(text = "I tried, I failed")  # NOTE(review): unreachable after sys.exit()
                    break
                else:
                    print("NOTHING IS STOPPING ME!")
                Labels.config(text = "Finished the article!")
                #brand = divWithInfo.div.a.img["title"]
                #title_container = divWithInfo.find("a", "item-title")
                #product_name = title_container.text
                #shipping_container = divWithInfo.find("li", "price-ship")
                #shipping_cost = shipping_container.text.strip()
                #print("brand:"+brand)
                #print("name:"+product_name)
                #print("shipping:"+shipping_cost)
                #print("\n")
                #f.write(brand + "," + product_name.replace(",", "|") + "," + shipping_cost + "\n")
            Labels.config(text = "All Done!")
            f.close()
            t.close()
        else:
            # The Verge branch — same pipeline with verge-specific selectors.
            progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate')
            progress['value'] = 0
            progress.pack(side = TOP)
            Labels = Label(topFrame, text = "SCRAPING")
            Labels.pack(side = TOP)
            texts = "change"  # NOTE(review): local, never read
            main_url = 'https://www.theverge.com/tech'
            uClient = uReq(main_url)
            page_html = uClient.read()
            uClient.close()
            page_soup = soup(page_html, "html.parser")
            containers = page_soup.findAll("div",{"class":"c-compact-river__entry"})
            Articles = len(containers)
            filename = self.directory + "/News.txt"
            trans_filename = self.directory + "/TranslatedNews.txt"
            f = io.open(filename, "w", encoding="utf-8")
            f.write("ACTIVE")
            t = io.open(trans_filename, "w", encoding ="utf-8")
            t.write("ACTIVE")
            Labels.config(text = "setting file!")
            i = 0
            CurrentTitle = Label(topFrame, text = "Preparing...")
            CurrentTitle.pack(side = TOP)
            for container in containers:
                i = i + 1
                Labels.config(text = "jumping to URL!")
                print(container["class"])
                # Skip the featured entry and anything that is not a plain article box.
                if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']:
                    print("\n WE'VE CATCHED A BUG!")
                    continue
                if container.div["class"] != ["c-entry-box--compact", "c-entry-box--compact--article"]:
                    print("\n WARNING! THIS IS NOT AN ARTICLE! ")
                    print(container.div["class"])
                    continue
                progress['value'] = i * 100 / Articles
                # NOTE(review): as above, these per-article bars accumulate.
                local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate')
                local_progress['value'] = 0
                local_progress.pack(side = BOTTOM)
                requiredURL = container.div.a["href"]
                secondary_URL = requiredURL
                print("Set target URL!")
                secClient = uReq(secondary_URL)
                news_html = secClient.read()
                secClient.close()
                news_soup = soup(news_html, "html.parser")
                news_soup.decode('utf-8', 'ignore')  # NOTE(review): result discarded — effectively a no-op
                news_containers = news_soup.findAll("div", {"class":"c-entry-hero__header-wrap"})
                if len(news_containers)>0:
                    news_title = news_containers[0].h1.text
                    CurrentTitle.config(text = news_title)
                    Labels.config(text = "Extracted Title!")
                else:
                    print("ERROR! NO TITLE AT "+secondary_URL)
                    Labels.config(text = "Failed to extract title")
                    # NOTE(review): news_title is still used below — NameError
                    # on the first article, or a stale title afterwards.
                news_body = news_soup.findAll("div", {"class":"c-entry-content"})
                print("\n TITLE: " + news_title)
                f.write("\n \n" + news_title + "\n")
                print("Now translating...")
                translatedQuery = translate(news_title, "ru", "en")
                t.write("\n \n" + translatedQuery + "\n")
                paragraphs = news_body[0].findAll("p")
                print("Title Recorded!")
                local_progress['value'] = 10
                # Two halves, one translate() call each (see techradar branch).
                y = len(paragraphs)
                x = 0
                fullText = ""
                fullText2 = ""
                for paragraph in paragraphs:
                    x = x + 1
                    local_progress['value'] = x * 100 / y + 10
                    stringx = str(x)
                    Labels.config(text = "Getting paragraph " + stringx + "...")
                    print(paragraph.text + "\n \n \n")
                    if x >= y/2:
                        fullText2 = fullText2 + paragraph.text.strip()
                    else:
                        fullText = fullText + paragraph.text.strip()
                    Labels.config(text = "Written and Translated Paragraph" + stringx + "!")
                    print("Writing Paragraph " + stringx + "...")
                    if self.needToSkip:
                        break
                if self.needToSkip:
                    self.needToSkip = False  # reset the one-shot flag
                    continue
                translatedQuery = translate(fullText, "ru", "en")
                completeText = translatedQuery
                translatedQuery = translate(fullText2, "ru", "en")
                completeText = completeText + translatedQuery
                f.write("\n" + fullText + fullText2)
                t.write("\n" + completeText)
                news_picture = news_soup.findAll("picture", {"class":"c-picture"})
                Labels.config(text = "Getting image...")
                # NOTE(review): news_picture[0] raises IndexError if findAll
                # returned nothing — the truly picture-less case is unguarded.
                if news_picture[0].img != None:
                    article_pic = news_picture[0].img.get("src")
                    Labels.config(text = "Picture recieved!")
                else:
                    print("\n THIS ARTICLE HAS NO PICTURE! ")
                    Labels.config(text = "Failed to locate picture :(")
                    # NOTE(review): article_pic still written below — NameError
                    # or a stale URL, as in the techradar branch.
                local_progress['value'] = 120
                f.write("\n PICTURE URL: " + article_pic)
                t.write("\n PICTURE URL: " + article_pic)
                if self.stop_threads.is_set():
                    print("I SURRENDER!")
                    self.stopped = True
                    f.close()
                    t.close()
                    self.CloseLabel.config(text = "you may close now")
                    sys.exit()
                    self.CloseLabel.config(text = "I tried, I failed")  # NOTE(review): unreachable after sys.exit()
                    break
                else:
                    print("NOTHING IS STOPPING ME!")
                Labels.config(text = "Finished the article!")
                #brand = divWithInfo.div.a.img["title"]
                #title_container = divWithInfo.find("a", "item-title")
                #product_name = title_container.text
                #shipping_container = divWithInfo.find("li", "price-ship")
                #shipping_cost = shipping_container.text.strip()
                #print("brand:"+brand)
                #print("name:"+product_name)
                #print("shipping:"+shipping_cost)
                #print("\n")
                #f.write(brand + "," + product_name.replace(",", "|") + "," + shipping_cost + "\n")
            Labels.config(text = "All Done!")
            f.close()
            t.close()
# ---- module-level GUI bootstrap ----
# Window title text; because it is defined unconditionally here, the
# try/except NameError below always takes the else branch.
texts = "VERGE SCRAPPER"
root = Tk()
program = Scrapers()
# NOTE(review): this thread object is never started or referenced again;
# start_thread() builds its own Thread instead. Dead code.
mainT = Thread(target=program.start_now)
try:
    texts
except NameError:
    theLabel = Label(root, text = "VERGE SCRAPER")
    theLabel.pack()
    print("NO TEXTS!")
else:
    theLabel = Label(root, text = texts)
    theLabel.pack()
    print("FOUND TEXTS!")
stop_thread = False  # NOTE(review): never read anywhere
topFrame = Frame(root)
topFrame.pack()
bottomFrame = Frame(root)
bottomFrame.pack(side=BOTTOM)
# Control buttons: start scraping, toggle the source site, pick the
# output directory, and request a stop.
button1 = Button(topFrame, text = "Start Scrapping!", command = program.start_thread)
button4 = Button(topFrame, text = agency, command = lambda: switchAgencies(button4['text']))
button2 = Button(topFrame, text = "Choose Text Location", fg = "black", command = chooseDirectory)
button3 = Button(topFrame, text = "STOP!", fg = "red", command = program.stopTheThread)
button3.pack(side = TOP)
button1.pack(side= TOP)
button4.pack(side= TOP)
button2.pack(side = TOP)
root.mainloop()