A tool which can be used to search and play YouTube music!

import tkinter
from tkinter import *
from os.path import expanduser
from tkinter import filedialog
from tkinter.ttk import Progressbar
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import io
from mtranslate import translate
import playsound
import sys
import time
from threading import *
import os
from tkinter import messagebox

desktop = expanduser("~/Documents")

def chooseDirectory():
    # Let the user pick the directory where News.txt / TranslatedNews.txt are written.
    currdir = os.getcwd()
    tempdir = filedialog.askdirectory(parent=root, initialdir=currdir, title='Please select a directory')
    program.directory = tempdir

class Scrapers(object):
    def __init__(self):
        self.thread1 = None
        self.stop_threads = Event()
        self.stopped = False
        self.CloseLabel = Label(root, text="Finalizing before breaking!")
        self.directory = desktop
        self.needToSkip = False

    def waitandkill(self):
        # Poll once per second until the worker thread reports that it has stopped.
        time.sleep(1)
        if self.stopped:
            print("DEAD")
        else:
            self.waitandkill()

    def stopTheThread(self):
        print("CALLED ME TOO?")
        self.stop_threads.set()
        self.CloseLabel.pack()
        self.waitandkill()
        print("calling wait")

    def skip(self):
        self.needToSkip = True

    def start_thread(self):
        Skip = Button(topFrame, text="SKIP!", command=self.skip)
        Skip.pack(side=BOTTOM)
        # Make sure both output files can be opened for writing before starting the worker.
        try:
            f = io.open(self.directory + "/TranslatedNews.txt", "w", encoding="utf-8")
        except IOError:
            print("FILE ERROR! " + self.directory + "/TranslatedNews.txt")
            messagebox.showerror("ACCESS ERROR!", "We can't access " + self.directory + "/TranslatedNews.txt")
            sys.exit()
        try:
            f = io.open(self.directory + "/News.txt", "w", encoding="utf-8")
        except IOError:
            print("FILE ERROR! " + self.directory + "/News.txt")
            messagebox.showerror("ACCESS ERROR!", "We can't access " + self.directory + "/News.txt")
            sys.exit()
        if self.thread1 is not None:
            print("NO!")
        else:
            self.thread1 = Thread(target=self.start_now)
            self.thread1.start()
            threadActive = 1

    def start_now(self):
        progress = Progressbar(topFrame, orient=HORIZONTAL, length=100, mode='determinate')
        progress['value'] = 0
        progress.pack(side=TOP)
        Labels = Label(topFrame, text="SCRAPING")
        Labels.pack(side=TOP)
        texts = "change"
        # Download the Verge tech front page and collect the article entries.
        main_url = 'https://www.theverge.com/tech'
        uClient = uReq(main_url)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")
        containers = page_soup.findAll("div", {"class": "c-compact-river__entry"})
        Articles = len(containers)
        filename = self.directory + "/News.txt"
        trans_filename = self.directory + "/TranslatedNews.txt"
        f = io.open(filename, "w", encoding="utf-8")
        f.write("ACTIVE")
        t = io.open(trans_filename, "w", encoding="utf-8")
        t.write("ACTIVE")
        Labels.config(text="setting file!")
        i = 0
        CurrentTitle = Label(topFrame, text="Preparing...")
        CurrentTitle.pack(side=TOP)
        for container in containers:
            i = i + 1
            Labels.config(text="jumping to URL!")
            print(container["class"])
            if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']:
                print("\n WE'VE CAUGHT A BUG!")
                continue
            if container.div["class"] != ["c-entry-box--compact", "c-entry-box--compact--article"]:
                print("\n WARNING! THIS IS NOT AN ARTICLE! ")
                print(container.div["class"])
                continue
            progress['value'] = i * 100 / Articles
            local_progress = Progressbar(topFrame, orient=HORIZONTAL, length=120, mode='determinate')
            local_progress['value'] = 0
            local_progress.pack(side=BOTTOM)
            requiredURL = container.div.a["href"]
            secondary_URL = requiredURL
            print("Set target URL!")
            secClient = uReq(secondary_URL)
            news_html = secClient.read()
            secClient.close()
            news_soup = soup(news_html, "html.parser")
            news_soup.decode('utf-8', 'ignore')
            news_containers = news_soup.findAll("div", {"class": "c-entry-hero__header-wrap"})
            if len(news_containers) > 0:
                news_title = news_containers[0].h1.text
                CurrentTitle.config(text=news_title)
                Labels.config(text="Extracted Title!")
            else:
                print("ERROR! NO TITLE AT " + secondary_URL)
                Labels.config(text="Failed to extract title")
                continue  # without a title there is nothing to record for this article
            news_body = news_soup.findAll("div", {"class": "c-entry-content"})
            print("\n TITLE: " + news_title)
            f.write("\n \n" + news_title + "\n")
            print("Now translating...")
            # Translate the title via mtranslate.
            translatedQuery = translate(news_title, "ru", "en")
            t.write("\n \n" + translatedQuery + "\n")
            paragraphs = news_body[0].findAll("p")
            print("Title Recorded!")
            local_progress['value'] = 10
            y = len(paragraphs)
            x = 0
            fullText = ""
            fullText2 = ""
            for paragraph in paragraphs:
                x = x + 1
                local_progress['value'] = x * 100 / y + 10
                stringx = str(x)
                Labels.config(text="Getting paragraph " + stringx + "...")
                print(paragraph.text + "\n \n \n")
                # Collect the first half of the article in fullText and the second half in fullText2.
                if x >= y / 2:
                    fullText2 = fullText2 + paragraph.text.strip()
                else:
                    fullText = fullText + paragraph.text.strip()
                Labels.config(text="Written and Translated Paragraph " + stringx + "!")
                print("Writing Paragraph " + stringx + "...")
                if self.needToSkip:
                    break
            if self.needToSkip:
                self.needToSkip = False
                continue
            translatedQuery = translate(fullText, "ru", "en")
            completeText = translatedQuery
            translatedQuery = translate(fullText2, "ru", "en")
            completeText = completeText + translatedQuery
            f.write("\n" + fullText + fullText2)
            t.write("\n" + completeText)
            news_picture = news_soup.findAll("picture", {"class": "c-picture"})
            Labels.config(text="Getting image...")
            article_pic = None
            if len(news_picture) > 0 and news_picture[0].img is not None:
                article_pic = news_picture[0].img.get("src")
                Labels.config(text="Picture received!")
            else:
                print("\n THIS ARTICLE HAS NO PICTURE! ")
                Labels.config(text="Failed to locate picture :(")
            local_progress['value'] = 120
            if article_pic is not None:
                f.write("\n PICTURE URL: " + article_pic)
                t.write("\n PICTURE URL: " + article_pic)
            if self.stop_threads.is_set():
                print("I SURRENDER!")
                self.stopped = True
                f.close()
                t.close()
                self.CloseLabel.config(text="you may close now")
                sys.exit()  # raises SystemExit, ending this worker thread
                self.CloseLabel.config(text="I tried, I failed")  # not reached
                break
            else:
                print("NOTHING IS STOPPING ME!")
                Labels.config(text="Finished the article!")
            # brand = divWithInfo.div.a.img["title"]
            # title_container = divWithInfo.find("a", "item-title")
            # product_name = title_container.text
            # shipping_container = divWithInfo.find("li", "price-ship")
            # shipping_cost = shipping_container.text.strip()
            # print("brand:"+brand)
            # print("name:"+product_name)
            # print("shipping:"+shipping_cost)
            # print("\n")
            # f.write(brand + "," + product_name.replace(",", "|") + "," + shipping_cost + "\n")
        Labels.config(text="All Done!")
        f.close()
        t.close()


texts = "VERGE SCRAPER"
root = Tk()
program = Scrapers()
mainT = Thread(target=program.start_now)
try:
    texts
except NameError:
    theLabel = Label(root, text="VERGE SCRAPER")
    theLabel.pack()
    print("NO TEXTS!")
else:
    theLabel = Label(root, text=texts)
    theLabel.pack()
    print("FOUND TEXTS!")
stop_thread = False
topFrame = Frame(root)
topFrame.pack()
bottomFrame = Frame(root)
bottomFrame.pack(side=BOTTOM)
button1 = Button(topFrame, text="Start Scraping!", command=program.start_thread)
button2 = Button(topFrame, text="Choose Text Location", fg="black", command=chooseDirectory)
button3 = Button(topFrame, text="STOP!", fg="red", command=program.stopTheThread)
button3.pack(side=TOP)
button1.pack(side=TOP)
button2.pack(side=TOP)
root.mainloop()