A web-scraping tool for verge and techradar
Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

375 řádky
16 KiB

  1. import tkinter
  2. from tkinter import *
  3. from os.path import expanduser
  4. from tkinter import filedialog
  5. from tkinter.ttk import Progressbar
  6. from urllib.request import urlopen as uReq
  7. from bs4 import BeautifulSoup as soup
  8. import io
  9. from mtranslate import translate
  10. import playsound
  11. import sys
  12. import time
  13. from threading import *
  14. import os
  15. from tkinter import messagebox
# Default output directory for the scraped text files.
desktop = expanduser("~/Documents")
# Agency shown on the toggle button at startup ("verge" or "techradar").
agency = "verge"
  18. def chooseDirectory():
  19. currdir = os.getcwd()
  20. tempdir = filedialog.askdirectory(parent=root, initialdir=currdir, title='Please select a directory')
  21. program.directory = tempdir
  22. def switchAgencies(agencies):
  23. print("called Agencies")
  24. if agencies == "verge":
  25. print("switching to techradar")
  26. agencies = "techradar"
  27. else:
  28. print("switching to verge")
  29. agencies = "verge"
  30. button4['text'] = agencies
  31. class Scrapers(object):
  32. def __init__(self):
  33. self.thread1 = None
  34. self.stop_threads = Event()
  35. self.stopped = False
  36. self.CloseLabel = Label(root, text = "Finalizing before breaking!")
  37. self.directory = desktop
  38. self.needToSkip = False
  39. def waitandkill(self):
  40. time.sleep(1)
  41. if (self.stopped == True):
  42. print("DEAD")
  43. else:
  44. self.waitandkill
  45. def stopTheThread(self):
  46. print("CALLED ME TOO?")
  47. self.stop_threads.set()
  48. self.CloseLabel.pack()
  49. self.waitandkill
  50. print("calling wait")
  51. def skip(self):
  52. self.needToSkip = True
  53. def start_thread(self):
  54. Skip = Button(topFrame, text = "SKIP!", command = self.skip)
  55. Skip.pack(side = BOTTOM)
  56. try:
  57. f = io.open(self.directory + "/TranslatedNews.txt", "w", encoding="utf-8")
  58. except IOError:
  59. print("FILE ERROR!" + self.directory + "/TranslatedNews.txt")
  60. messagebox.showerror("ACCESS ERROR!", "We can't access "+ self.directory + "/TranslatedNews.txt")
  61. sys.exit()
  62. try:
  63. f = io.open(self.directory + "/News.txt", "w", encoding="utf-8")
  64. except IOError:
  65. print("FILE ERROR!" + self.directory + "/News.txt")
  66. messagebox.showerror("ACCESS ERROR!", "We can't access "+ self.directory + "/News.txt")
  67. sys.exit()
  68. if self.thread1!=None:
  69. print("NO!")
  70. else:
  71. self.thread1 = Thread(target = self.start_now)
  72. self.thread1.start()
  73. threadActive = 1
  74. def start_now(self):
  75. print("Getting" + button4['text'])
  76. if button4['text'] == "techradar":
  77. progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate')
  78. progress['value'] = 0
  79. progress.pack(side = TOP)
  80. Labels = Label(topFrame, text = "SCRAPING")
  81. Labels.pack(side = TOP)
  82. texts = "change"
  83. main_url = 'https://www.techradar.com/news'
  84. uClient = uReq(main_url)
  85. page_html = uClient.read()
  86. uClient.close()
  87. page_soup = soup(page_html, "html.parser")
  88. containers = page_soup.findAll("div",{"class":"listingResult"})
  89. Articles = len(containers)
  90. print(Articles)
  91. filename = self.directory + "/News.txt"
  92. trans_filename = self.directory + "/TranslatedNews.txt"
  93. f = io.open(filename, "w", encoding="utf-8")
  94. f.write("ACTIVE")
  95. t = io.open(trans_filename, "w", encoding ="utf-8")
  96. t.write("ACTIVE")
  97. Labels.config(text = "setting file!")
  98. i = 0
  99. CurrentTitle = Label(topFrame, text = "Preparing...")
  100. CurrentTitle.pack(side = TOP)
  101. for container in containers:
  102. i = i + 1
  103. Labels.config(text = "jumping to URL!")
  104. print(container["class"])
  105. if 'sponsored-post' in container["class"]:
  106. print("\n WE'VE CATCHED AN AD!")
  107. continue
  108. progress['value'] = i * 100 / Articles
  109. local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate')
  110. local_progress['value'] = 0
  111. local_progress.pack(side = BOTTOM)
  112. requiredURL = container.a["href"]
  113. secondary_URL = requiredURL
  114. print("Set target URL!" + requiredURL)
  115. secClient = uReq(secondary_URL)
  116. news_html = secClient.read()
  117. secClient.close()
  118. news_soup = soup(news_html, "html.parser")
  119. news_soup.decode('utf-8', 'ignore')
  120. squash = news_soup.findAll("div",{"class":"icon-plus_circle"})
  121. print(len(squash))
  122. if len(squash)>0:
  123. print("\n WARNING! THIS IS NOT AN ARTICLE! ")
  124. print(container.div["class"])
  125. continue
  126. news_containers = news_soup.findAll("header")
  127. if len(news_containers)>0:
  128. news_title = news_containers[0].h1.text
  129. CurrentTitle.config(text = news_title)
  130. Labels.config(text = "Extracted Title!")
  131. else:
  132. print("ERROR! NO TITLE AT "+secondary_URL)
  133. Labels.config(text = "Failed to extract title")
  134. news_body = news_soup.findAll("div", {"id":"article-body"})
  135. print("\n TITLE: " + news_title)
  136. f.write("\n \n" + news_title + "\n")
  137. print("Now translating...")
  138. translatedQuery = translate(news_title, "ru", "en")
  139. t.write("\n \n" + translatedQuery + "\n")
  140. paragraphs = news_body[0].findAll("p")
  141. print("Title Recorded!")
  142. local_progress['value'] = 10
  143. y = len(paragraphs)
  144. x = 0
  145. fullText = ""
  146. fullText2 = ""
  147. for paragraph in paragraphs:
  148. x = x + 1
  149. local_progress['value'] = x * 100 / y + 10
  150. stringx = str(x)
  151. Labels.config(text = "Getting paragraph " + stringx + "...")
  152. print(paragraph.text + "\n \n \n")
  153. if x >= y/2:
  154. fullText2 = fullText2 + paragraph.text.strip()
  155. else:
  156. fullText = fullText + paragraph.text.strip()
  157. Labels.config(text = "Written and Translated Paragraph" + stringx + "!")
  158. print("Writing Paragraph " + stringx + "...")
  159. if self.needToSkip:
  160. break
  161. if self.needToSkip:
  162. self.needToSkip = False
  163. continue
  164. translatedQuery = translate(fullText, "ru", "en")
  165. completeText = translatedQuery
  166. translatedQuery = translate(fullText2, "ru", "en")
  167. completeText = completeText + translatedQuery
  168. f.write("\n" + fullText + fullText2)
  169. t.write("\n" + completeText)
  170. news_picture = news_soup.findAll("source", {"class":"hero-image"})
  171. Labels.config(text = "Getting image...")
  172. if len(news_picture) > 0:
  173. article_pic = news_picture[0].get("data-original-mos")
  174. Labels.config(text = "Picture recieved!")
  175. else:
  176. print("\n THIS ARTICLE HAS NO PICTURE! ")
  177. Labels.config(text = "Failed to locate picture :(")
  178. local_progress['value'] = 120
  179. f.write("\n PICTURE URL: " + article_pic)
  180. t.write("\n PICTURE URL: " + article_pic)
  181. if self.stop_threads.is_set():
  182. print("I SURRENDER!")
  183. self.stopped = True
  184. f.close()
  185. t.close()
  186. self.CloseLabel.config(text = "you may close now")
  187. sys.exit()
  188. self.CloseLabel.config(text = "I tried, I failed")
  189. break
  190. else:
  191. print("NOTHING IS STOPPING ME!")
  192. Labels.config(text = "Finished the article!")
  193. #brand = divWithInfo.div.a.img["title"]
  194. #title_container = divWithInfo.find("a", "item-title")
  195. #product_name = title_container.text
  196. #shipping_container = divWithInfo.find("li", "price-ship")
  197. #shipping_cost = shipping_container.text.strip()
  198. #print("brand:"+brand)
  199. #print("name:"+product_name)
  200. #print("shipping:"+shipping_cost)
  201. #print("\n")
  202. #f.write(brand + "," + product_name.replace(",", "|") + "," + shipping_cost + "\n")
  203. Labels.config(text = "All Done!")
  204. f.close()
  205. t.close()
  206. else:
  207. progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate')
  208. progress['value'] = 0
  209. progress.pack(side = TOP)
  210. Labels = Label(topFrame, text = "SCRAPING")
  211. Labels.pack(side = TOP)
  212. texts = "change"
  213. main_url = 'https://www.theverge.com/tech'
  214. uClient = uReq(main_url)
  215. page_html = uClient.read()
  216. uClient.close()
  217. page_soup = soup(page_html, "html.parser")
  218. containers = page_soup.findAll("div",{"class":"c-compact-river__entry"})
  219. Articles = len(containers)
  220. filename = self.directory + "/News.txt"
  221. trans_filename = self.directory + "/TranslatedNews.txt"
  222. f = io.open(filename, "w", encoding="utf-8")
  223. f.write("ACTIVE")
  224. t = io.open(trans_filename, "w", encoding ="utf-8")
  225. t.write("ACTIVE")
  226. Labels.config(text = "setting file!")
  227. i = 0
  228. CurrentTitle = Label(topFrame, text = "Preparing...")
  229. CurrentTitle.pack(side = TOP)
  230. for container in containers:
  231. i = i + 1
  232. Labels.config(text = "jumping to URL!")
  233. print(container["class"])
  234. if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']:
  235. print("\n WE'VE CATCHED A BUG!")
  236. continue
  237. if container.div["class"] != ["c-entry-box--compact", "c-entry-box--compact--article"]:
  238. print("\n WARNING! THIS IS NOT AN ARTICLE! ")
  239. print(container.div["class"])
  240. continue
  241. progress['value'] = i * 100 / Articles
  242. local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate')
  243. local_progress['value'] = 0
  244. local_progress.pack(side = BOTTOM)
  245. requiredURL = container.div.a["href"]
  246. secondary_URL = requiredURL
  247. print("Set target URL!")
  248. secClient = uReq(secondary_URL)
  249. news_html = secClient.read()
  250. secClient.close()
  251. news_soup = soup(news_html, "html.parser")
  252. news_soup.decode('utf-8', 'ignore')
  253. news_containers = news_soup.findAll("div", {"class":"c-entry-hero__header-wrap"})
  254. if len(news_containers)>0:
  255. news_title = news_containers[0].h1.text
  256. CurrentTitle.config(text = news_title)
  257. Labels.config(text = "Extracted Title!")
  258. else:
  259. print("ERROR! NO TITLE AT "+secondary_URL)
  260. Labels.config(text = "Failed to extract title")
  261. news_body = news_soup.findAll("div", {"class":"c-entry-content"})
  262. print("\n TITLE: " + news_title)
  263. f.write("\n \n" + news_title + "\n")
  264. print("Now translating...")
  265. translatedQuery = translate(news_title, "ru", "en")
  266. t.write("\n \n" + translatedQuery + "\n")
  267. paragraphs = news_body[0].findAll("p")
  268. print("Title Recorded!")
  269. local_progress['value'] = 10
  270. y = len(paragraphs)
  271. x = 0
  272. fullText = ""
  273. fullText2 = ""
  274. for paragraph in paragraphs:
  275. x = x + 1
  276. local_progress['value'] = x * 100 / y + 10
  277. stringx = str(x)
  278. Labels.config(text = "Getting paragraph " + stringx + "...")
  279. print(paragraph.text + "\n \n \n")
  280. if x >= y/2:
  281. fullText2 = fullText2 + paragraph.text.strip()
  282. else:
  283. fullText = fullText + paragraph.text.strip()
  284. Labels.config(text = "Written and Translated Paragraph" + stringx + "!")
  285. print("Writing Paragraph " + stringx + "...")
  286. if self.needToSkip:
  287. break
  288. if self.needToSkip:
  289. self.needToSkip = False
  290. continue
  291. translatedQuery = translate(fullText, "ru", "en")
  292. completeText = translatedQuery
  293. translatedQuery = translate(fullText2, "ru", "en")
  294. completeText = completeText + translatedQuery
  295. f.write("\n" + fullText + fullText2)
  296. t.write("\n" + completeText)
  297. news_picture = news_soup.findAll("picture", {"class":"c-picture"})
  298. Labels.config(text = "Getting image...")
  299. if news_picture[0].img != None:
  300. article_pic = news_picture[0].img.get("src")
  301. Labels.config(text = "Picture recieved!")
  302. else:
  303. print("\n THIS ARTICLE HAS NO PICTURE! ")
  304. Labels.config(text = "Failed to locate picture :(")
  305. local_progress['value'] = 120
  306. f.write("\n PICTURE URL: " + article_pic)
  307. t.write("\n PICTURE URL: " + article_pic)
  308. if self.stop_threads.is_set():
  309. print("I SURRENDER!")
  310. self.stopped = True
  311. f.close()
  312. t.close()
  313. self.CloseLabel.config(text = "you may close now")
  314. sys.exit()
  315. self.CloseLabel.config(text = "I tried, I failed")
  316. break
  317. else:
  318. print("NOTHING IS STOPPING ME!")
  319. Labels.config(text = "Finished the article!")
  320. #brand = divWithInfo.div.a.img["title"]
  321. #title_container = divWithInfo.find("a", "item-title")
  322. #product_name = title_container.text
  323. #shipping_container = divWithInfo.find("li", "price-ship")
  324. #shipping_cost = shipping_container.text.strip()
  325. #print("brand:"+brand)
  326. #print("name:"+product_name)
  327. #print("shipping:"+shipping_cost)
  328. #print("\n")
  329. #f.write(brand + "," + product_name.replace(",", "|") + "," + shipping_cost + "\n")
  330. Labels.config(text = "All Done!")
  331. f.close()
  332. t.close()
  333. texts = "VERGE SCRAPPER"
  334. root = Tk()
  335. program = Scrapers()
  336. mainT = Thread(target=program.start_now)
  337. try:
  338. texts
  339. except NameError:
  340. theLabel = Label(root, text = "VERGE SCRAPER")
  341. theLabel.pack()
  342. print("NO TEXTS!")
  343. else:
  344. theLabel = Label(root, text = texts)
  345. theLabel.pack()
  346. print("FOUND TEXTS!")
  347. stop_thread = False
  348. topFrame = Frame(root)
  349. topFrame.pack()
  350. bottomFrame = Frame(root)
  351. bottomFrame.pack(side=BOTTOM)
  352. button1 = Button(topFrame, text = "Start Scrapping!", command = program.start_thread)
  353. button4 = Button(topFrame, text = agency, command = lambda: switchAgencies(button4['text']))
  354. button2 = Button(topFrame, text = "Choose Text Location", fg = "black", command = chooseDirectory)
  355. button3 = Button(topFrame, text = "STOP!", fg = "red", command = program.stopTheThread)
  356. button3.pack(side = TOP)
  357. button1.pack(side= TOP)
  358. button4.pack(side= TOP)
  359. button2.pack(side = TOP)
  360. root.mainloop()