A web-scraping tool for The Verge and TechRadar.
Nie możesz wybrać więcej, niż 25 tematów Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.

YANDEX.pyw 9.0 KiB

4 lat temu
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. import tkinter
  2. from tkinter import *
  3. from os.path import expanduser
  4. from yandex.Translater import Translater
  5. from tkinter import filedialog
  6. from tkinter.ttk import Progressbar
  7. from urllib.request import urlopen as uReq
  8. from bs4 import BeautifulSoup as soup
  9. import io
  10. import playsound
  11. import sys
  12. import time
  13. from threading import *
  14. import os
  15. from tkinter import messagebox
  # Default output directory. Despite the name, this is the user's Documents
  # folder; the name is kept because other code refers to ``desktop``.
  desktop = expanduser("~/Documents")

  # Shared translator client, configured for English -> Russian.
  tr = Translater()
  # NOTE(review): an API key committed to source control should be treated as
  # leaked and rotated. The YANDEX_TRANSLATE_KEY environment variable, when
  # set, takes precedence; the embedded key is kept only as a fallback so
  # existing setups keep working.
  tr.set_key(os.environ.get(
      'YANDEX_TRANSLATE_KEY',
      'trnsl.1.1.20200525T143704Z.2789c3467e473787.8844abd61fe46dfedeef7f4f4a43082012802ae9',
  ))
  tr.set_from_lang('en')
  tr.set_to_lang('ru')
  def chooseDirectory():
      """Ask the user for an output directory and store it on ``program``.

      The dialog starts in the current working directory. If the user
      cancels, the dialog returns an empty string; in that case the
      previously configured directory is left untouched, so output paths are
      never built from an empty prefix (which would resolve to
      ``/News.txt``).
      """
      currdir = os.getcwd()
      tempdir = filedialog.askdirectory(
          parent=root,
          initialdir=currdir,
          title='Please select a directory',
      )
      if tempdir:
          program.directory = tempdir
  class Scrapers(object):
      """Scrape The Verge tech articles and save original and translated text.

      The scraper writes ``News.txt`` (original text) and
      ``TranslatedNews.txt`` (text passed through the module-level translator
      ``tr``) into ``self.directory``. Scraping runs on a worker thread
      started by :meth:`start_thread`; :meth:`stopTheThread` and :meth:`skip`
      are the GUI callbacks that influence a running worker.

      The class relies on the module-level names ``root``, ``topFrame``,
      ``desktop`` and ``tr`` being defined before its methods are called.

      NOTE(review): the worker thread creates and updates tkinter widgets
      directly. Tkinter does not guarantee this is safe from non-main
      threads; the existing behaviour is kept as-is here.
      """

      # Class list of the "featured" river entry, which is skipped.
      _FEATURED_CLASSES = ['c-compact-river__entry',
                           'c-compact-river__entry--featured']
      # Class list that identifies a regular article entry box.
      _ARTICLE_CLASSES = ["c-entry-box--compact",
                          "c-entry-box--compact--article"]

      def __init__(self):
          # Worker thread; None until the first successful start_thread().
          self.thread1 = None
          # Set by stopTheThread() to ask the worker to stop.
          self.stop_threads = Event()
          # Set to True by the worker once it has honoured a stop request.
          self.stopped = False
          self.CloseLabel = Label(root, text="Finalizing before breaking!")
          # Directory the two output files are written to.
          self.directory = desktop
          # Set by skip(); consumed by the worker while reading paragraphs.
          self.needToSkip = False

      def waitandkill(self, poll_interval=1):
          """Block until the worker has honoured a stop request.

          Polls ``self.stopped`` every ``poll_interval`` seconds and prints
          ``DEAD`` once it is set. Returns silently if the worker thread is
          absent or has already finished without honouring the request, so
          the wait cannot go on forever.

          The original recursive version referenced ``self.waitandkill``
          without calling it, so it never actually waited.
          """
          while True:
              time.sleep(poll_interval)
              if self.stopped:
                  print("DEAD")
                  return
              worker = self.thread1
              if worker is None or not worker.is_alive():
                  return

      def stopTheThread(self):
          """GUI callback: ask the worker to stop after the current article."""
          print("CALLED ME TOO?")
          self.stop_threads.set()
          self.CloseLabel.pack()
          # Wait on a daemon thread: waiting in this callback would block the
          # Tk main loop while the worker is still updating widgets.
          Thread(target=self.waitandkill, daemon=True).start()
          print("calling wait")

      def skip(self):
          """GUI callback: ask the worker to abandon the current article."""
          self.needToSkip = True

      def start_thread(self):
          """GUI callback: verify the output files are writable, then start.

          A worker is only ever started once; later calls print ``NO!`` and
          return before touching the output files or adding another SKIP
          button. If either output file cannot be opened for writing, an
          error dialog is shown and the program exits via ``sys.exit()``.
          """
          if self.thread1 is not None:
              print("NO!")
              return

          Skip = Button(topFrame, text="SKIP!", command=self.skip)
          Skip.pack(side=BOTTOM)

          for suffix in ("/TranslatedNews.txt", "/News.txt"):
              path = self.directory + suffix
              try:
                  # Open and immediately close: this only checks write access.
                  with io.open(path, "w", encoding="utf-8"):
                      pass
              except IOError:
                  print("FILE ERROR!" + path)
                  messagebox.showerror("ACCESS ERROR!",
                                       "We can't access " + path)
                  sys.exit()

          self.thread1 = Thread(target=self.start_now)
          self.thread1.start()

      def _fetch_soup(self, url):
          """Download ``url`` and return it parsed with ``html.parser``."""
          with uReq(url) as client:
              html = client.read()
          return soup(html, "html.parser")

      def _translate(self, text):
          """Translate ``text`` with the shared translator; '' stays ''."""
          if not text:
              return ""
          tr.set_text(text)
          return tr.translate()

      def _scrape_article(self, url, f, t, Labels, CurrentTitle,
                          local_progress):
          """Scrape one article and append it to both output files.

          ``f`` receives the original text and ``t`` the translated text.
          Returns True when the article was written completely, and False
          when it was skipped by the user or lacked a title or body.
          """
          print("Set target URL!")
          news_soup = self._fetch_soup(url)

          news_containers = news_soup.findAll(
              "div", {"class": "c-entry-hero__header-wrap"})
          heading = news_containers[0].h1 if news_containers else None
          if heading is None:
              # Without a title there is nothing sensible to record.
              print("ERROR! NO TITLE AT " + url)
              Labels.config(text="Failed to extract title")
              return False
          news_title = heading.text
          CurrentTitle.config(text=news_title)
          Labels.config(text="Extracted Title!")

          news_body = news_soup.findAll("div", {"class": "c-entry-content"})
          if not news_body:
              print("ERROR! NO BODY AT " + url)
              Labels.config(text="Failed to extract body")
              return False

          print("\n TITLE: " + news_title)
          f.write("\n \n" + news_title + "\n")
          print("Now translating...")
          t.write("\n \n" + self._translate(news_title) + "\n")
          paragraphs = news_body[0].findAll("p")
          print("Title Recorded!")
          local_progress['value'] = 10

          # The text is collected in two halves that are translated
          # separately (presumably to keep each translation request small).
          y = len(paragraphs)
          first_half = []
          second_half = []
          for x, paragraph in enumerate(paragraphs, start=1):
              local_progress['value'] = x * 100 / y + 10
              stringx = str(x)
              Labels.config(text="Getting paragraph " + stringx + "...")
              print(paragraph.text + "\n \n \n")
              if x >= y / 2:
                  second_half.append(paragraph.text.strip())
              else:
                  first_half.append(paragraph.text.strip())
              Labels.config(
                  text="Written and Translated Paragraph" + stringx + "!")
              print("Writing Paragraph " + stringx + "...")
              if self.needToSkip:
                  break
          if self.needToSkip:
              self.needToSkip = False
              return False

          fullText = "".join(first_half)
          fullText2 = "".join(second_half)
          # Translate BOTH halves (the first half used to be translated twice).
          completeText = self._translate(fullText) + self._translate(fullText2)
          f.write("\n" + fullText + fullText2)
          t.write("\n" + completeText)

          news_picture = news_soup.findAll("picture", {"class": "c-picture"})
          Labels.config(text="Getting image...")
          article_pic = None
          if news_picture and news_picture[0].img is not None:
              article_pic = news_picture[0].img.get("src")
          if article_pic:
              Labels.config(text="Picture recieved!")
          else:
              print("\n THIS ARTICLE HAS NO PICTURE! ")
              Labels.config(text="Failed to locate picture :(")
          local_progress['value'] = 120
          if article_pic:
              f.write("\n PICTURE URL: " + article_pic)
              t.write("\n PICTURE URL: " + article_pic)
          return True

      def start_now(self):
          """Worker body: scrape every article listed on The Verge tech page.

          Creates the progress widgets, downloads the listing page, and
          processes each regular article entry in turn. After each fully
          written article the stop flag is checked; when it is set, the
          files are closed, ``self.stopped`` becomes True and the method
          returns, which ends the worker thread.
          """
          progress = Progressbar(topFrame, orient=HORIZONTAL, length=100,
                                 mode='determinate')
          progress['value'] = 0
          progress.pack(side=TOP)
          Labels = Label(topFrame, text="SCRAPING")
          Labels.pack(side=TOP)

          page_soup = self._fetch_soup('https://www.theverge.com/tech')
          containers = page_soup.findAll(
              "div", {"class": "c-compact-river__entry"})
          Articles = len(containers)

          filename = self.directory + "/News.txt"
          trans_filename = self.directory + "/TranslatedNews.txt"
          stop_requested = False
          # ``with`` guarantees both files are closed on every exit path,
          # including exceptions raised while scraping.
          with io.open(filename, "w", encoding="utf-8") as f, \
                  io.open(trans_filename, "w", encoding="utf-8") as t:
              f.write("ACTIVE")
              t.write("ACTIVE")
              Labels.config(text="setting file!")
              CurrentTitle = Label(topFrame, text="Preparing...")
              CurrentTitle.pack(side=TOP)
              # One per-article bar, reset for each article, instead of a new
              # widget per article that is never destroyed.
              local_progress = Progressbar(topFrame, orient=HORIZONTAL,
                                           length=120, mode='determinate')
              local_progress.pack(side=BOTTOM)

              for i, container in enumerate(containers, start=1):
                  Labels.config(text="jumping to URL!")
                  print(container["class"])
                  if container["class"] == self._FEATURED_CLASSES:
                      print("\n WE'VE CATCHED A BUG!")
                      continue
                  entry = container.div
                  entry_classes = None
                  link = None
                  if entry is not None:
                      entry_classes = entry.get("class")
                      link = entry.a
                  if (entry_classes != self._ARTICLE_CLASSES
                          or link is None or not link.get("href")):
                      print("\n WARNING! THIS IS NOT AN ARTICLE! ")
                      print(entry_classes)
                      continue

                  progress['value'] = i * 100 / Articles
                  local_progress['value'] = 0
                  written = self._scrape_article(
                      link["href"], f, t, Labels, CurrentTitle,
                      local_progress)
                  if not written:
                      continue

                  if self.stop_threads.is_set():
                      print("I SURRENDER!")
                      stop_requested = True
                      break
                  print("NOTHING IS STOPPING ME!")
                  Labels.config(text="Finished the article!")

          if stop_requested:
              # Reported only after the files above have been closed.
              self.stopped = True
              self.CloseLabel.config(text="you may close now")
              return
          Labels.config(text="All Done!")
  # --- GUI bootstrap -------------------------------------------------------
  # Module-level names created here (root, program, topFrame, ...) are looked
  # up as globals by the callbacks, so they must stay at module scope.
  root = Tk()
  program = Scrapers()
  mainT = Thread(target=program.start_now)  # created but never started

  # Use a module-level ``texts`` as the heading when one exists; otherwise
  # fall back to the default title.
  if "texts" in globals():
      theLabel = Label(root, text=texts)
      theLabel.pack()
      print("FOUND TEXTS!")
  else:
      theLabel = Label(root, text="VERGE SCRAPER")
      theLabel.pack()
      print("NO TEXTS!")

  stop_thread = False

  topFrame = Frame(root)
  topFrame.pack()
  bottomFrame = Frame(root)
  bottomFrame.pack(side=BOTTOM)

  button1 = Button(topFrame, text="Start Scrapping!",
                   command=program.start_thread)
  button2 = Button(topFrame, text="Choose Text Location", fg="black",
                   command=chooseDirectory)
  button3 = Button(topFrame, text="STOP!", fg="red",
                   command=program.stopTheThread)

  # Pack order fixes the on-screen order: STOP, Start, Choose location.
  button3.pack(side=TOP)
  button1.pack(side=TOP)
  button2.pack(side=TOP)

  root.mainloop()