import io
import os
import sys
import time
from os.path import expanduser
from threading import Thread, Event
from urllib.request import urlopen as uReq

from bs4 import BeautifulSoup as soup
from mtranslate import translate
import playsound

from tkinter import *
from tkinter import filedialog, messagebox
from tkinter.ttk import Progressbar

# Default output directory and default news agency.
desktop = expanduser("~/Documents")
agency = "verge"
def chooseDirectory():
    # Ask the user for an output directory and remember it on the scraper.
    currdir = os.getcwd()
    tempdir = filedialog.askdirectory(parent=root, initialdir=currdir,
                                      title='Please select a directory')
    if tempdir:
        program.directory = tempdir


def switchAgencies(agencies):
    # Toggle the news source between The Verge and TechRadar.
    print("called Agencies")
    if agencies == "verge":
        print("switching to techradar")
        agencies = "techradar"
    else:
        print("switching to verge")
        agencies = "verge"
    button4['text'] = agencies
class Scrapers(object):
    """Scrapes tech news, translates it to Russian, and saves both versions to disk."""

    def __init__(self):
        self.thread1 = None
        self.stop_threads = Event()
        self.stopped = False
        self.CloseLabel = Label(root, text="Finalizing before breaking!")
        self.directory = desktop
        self.needToSkip = False

    def waitandkill(self):
        # Poll once per second until the worker thread reports it has stopped.
        time.sleep(1)
        if self.stopped:
            print("DEAD")
        else:
            self.waitandkill()

    def stopTheThread(self):
        # Ask the worker thread to stop after the article it is currently processing.
        print("Stop requested")
        self.stop_threads.set()
        self.CloseLabel.pack()
        print("calling wait")
        # Blocks the UI until the worker acknowledges the stop request.
        self.waitandkill()

    def skip(self):
        # Skip the article that is currently being processed.
        self.needToSkip = True
    def start_thread(self):
        # Add a skip button, verify the output files are writable, then launch the worker.
        Skip = Button(topFrame, text="SKIP!", command=self.skip)
        Skip.pack(side=BOTTOM)

        for name in ("/TranslatedNews.txt", "/News.txt"):
            path = self.directory + name
            try:
                with io.open(path, "w", encoding="utf-8"):
                    pass
            except IOError:
                print("FILE ERROR! " + path)
                messagebox.showerror("ACCESS ERROR!", "We can't access " + path)
                sys.exit()

        if self.thread1 is not None:
            print("A scraping thread is already running!")
        else:
            self.thread1 = Thread(target=self.start_now)
            self.thread1.start()
    def start_now(self):
        # Worker routine: scrape every article, translate it, and write both versions.
        # Note: this runs on a background thread but updates Tk widgets directly,
        # which tkinter does not guarantee to be thread-safe.
        print("Getting " + button4['text'])
        if button4['text'] == "techradar":
            progress = Progressbar(topFrame, orient=HORIZONTAL, length=100, mode='determinate')
            progress['value'] = 0
            progress.pack(side=TOP)
            Labels = Label(topFrame, text="SCRAPING")
            Labels.pack(side=TOP)

            # Download the TechRadar news listing and collect the article teasers.
            main_url = 'https://www.techradar.com/news'
            uClient = uReq(main_url)
            page_html = uClient.read()
            uClient.close()
            page_soup = soup(page_html, "html.parser")
            containers = page_soup.findAll("div", {"class": "listingResult"})

            Articles = len(containers)
            print(Articles)

            # One file for the original articles, one for the translations.
            filename = self.directory + "/News.txt"
            trans_filename = self.directory + "/TranslatedNews.txt"
            f = io.open(filename, "w", encoding="utf-8")
            f.write("ACTIVE")
            t = io.open(trans_filename, "w", encoding="utf-8")
            t.write("ACTIVE")
            Labels.config(text="setting file!")

            i = 0
            CurrentTitle = Label(topFrame, text="Preparing...")
            CurrentTitle.pack(side=TOP)
            for container in containers:
                i = i + 1
                Labels.config(text="jumping to URL!")
                print(container["class"])

                # Sponsored posts are ads, not articles; skip them.
                if 'sponsored-post' in container["class"]:
                    print("\n WE'VE CAUGHT AN AD!")
                    continue

                progress['value'] = i * 100 / Articles
                local_progress = Progressbar(topFrame, orient=HORIZONTAL, length=120, mode='determinate')
                local_progress['value'] = 0
                local_progress.pack(side=BOTTOM)

                # Follow the teaser link to the full article page.
                requiredURL = container.a["href"]
                secondary_URL = requiredURL
                print("Set target URL! " + requiredURL)
                secClient = uReq(secondary_URL)
                news_html = secClient.read()
                secClient.close()
                news_soup = soup(news_html, "html.parser")
                news_soup.decode('utf-8', 'ignore')

                # Pages carrying an icon-plus_circle block are galleries, not articles.
                squash = news_soup.findAll("div", {"class": "icon-plus_circle"})
                print(len(squash))
                if len(squash) > 0:
                    print("\n WARNING! THIS IS NOT AN ARTICLE! ")
                    print(container.div["class"])
                    continue

                news_containers = news_soup.findAll("header")
                if len(news_containers) > 0:
                    news_title = news_containers[0].h1.text
                    CurrentTitle.config(text=news_title)
                    Labels.config(text="Extracted Title!")
                else:
                    print("ERROR! NO TITLE AT " + secondary_URL)
                    Labels.config(text="Failed to extract title")
                    continue  # without a title there is nothing to record

                news_body = news_soup.findAll("div", {"id": "article-body"})

                print("\n TITLE: " + news_title)
                f.write("\n \n" + news_title + "\n")
                print("Now translating...")
                translatedQuery = translate(news_title, "ru", "en")
                t.write("\n \n" + translatedQuery + "\n")
                paragraphs = news_body[0].findAll("p")
                print("Title Recorded!")
                local_progress['value'] = 10

                # Collect the body text in two halves so each translation request stays small.
                y = len(paragraphs)
                x = 0
                fullText = ""
                fullText2 = ""
                for paragraph in paragraphs:
                    x = x + 1
                    local_progress['value'] = x * 100 / y + 10
                    stringx = str(x)
                    Labels.config(text="Getting paragraph " + stringx + "...")
                    print(paragraph.text + "\n \n \n")
                    if x >= y / 2:
                        fullText2 = fullText2 + paragraph.text.strip()
                    else:
                        fullText = fullText + paragraph.text.strip()
                    Labels.config(text="Collected paragraph " + stringx + "!")
                    print("Writing Paragraph " + stringx + "...")
                    if self.needToSkip:
                        break

                if self.needToSkip:
                    self.needToSkip = False
                    continue

                # Translate both halves, then write the original and the translation.
                translatedQuery = translate(fullText, "ru", "en")
                completeText = translatedQuery
                translatedQuery = translate(fullText2, "ru", "en")
                completeText = completeText + translatedQuery
                f.write("\n" + fullText + fullText2)
                t.write("\n" + completeText)

                # Record the hero image URL when one exists.
                news_picture = news_soup.findAll("source", {"class": "hero-image"})
                Labels.config(text="Getting image...")
                if len(news_picture) > 0:
                    article_pic = news_picture[0].get("data-original-mos")
                    Labels.config(text="Picture received!")
                    f.write("\n PICTURE URL: " + article_pic)
                    t.write("\n PICTURE URL: " + article_pic)
                else:
                    print("\n THIS ARTICLE HAS NO PICTURE! ")
                    Labels.config(text="Failed to locate picture :(")
                local_progress['value'] = 120

                # Honour a stop request between articles.
                if self.stop_threads.is_set():
                    print("I SURRENDER!")
                    self.stopped = True
                    f.close()
                    t.close()
                    self.CloseLabel.config(text="you may close now")
                    sys.exit()
                else:
                    print("NOTHING IS STOPPING ME!")
                    Labels.config(text="Finished the article!")

            Labels.config(text="All Done!")
            f.close()
            t.close()
        else:
            progress = Progressbar(topFrame, orient=HORIZONTAL, length=100, mode='determinate')
            progress['value'] = 0
            progress.pack(side=TOP)
            Labels = Label(topFrame, text="SCRAPING")
            Labels.pack(side=TOP)

            # Download The Verge tech listing and collect the article teasers.
            main_url = 'https://www.theverge.com/tech'
            uClient = uReq(main_url)
            page_html = uClient.read()
            uClient.close()
            page_soup = soup(page_html, "html.parser")
            containers = page_soup.findAll("div", {"class": "c-compact-river__entry"})
            Articles = len(containers)

            # One file for the original articles, one for the translations.
            filename = self.directory + "/News.txt"
            trans_filename = self.directory + "/TranslatedNews.txt"
            f = io.open(filename, "w", encoding="utf-8")
            f.write("ACTIVE")
            t = io.open(trans_filename, "w", encoding="utf-8")
            t.write("ACTIVE")
            Labels.config(text="setting file!")

            i = 0
            CurrentTitle = Label(topFrame, text="Preparing...")
            CurrentTitle.pack(side=TOP)
            for container in containers:
                i = i + 1
                Labels.config(text="jumping to URL!")
                print(container["class"])

                # The featured entry duplicates a regular one; skip it.
                if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']:
                    print("\n WE'VE CAUGHT A FEATURED ENTRY!")
                    continue

                # Anything that is not a plain compact article box is not an article.
                if container.div["class"] != ["c-entry-box--compact", "c-entry-box--compact--article"]:
                    print("\n WARNING! THIS IS NOT AN ARTICLE! ")
                    print(container.div["class"])
                    continue

                progress['value'] = i * 100 / Articles
                local_progress = Progressbar(topFrame, orient=HORIZONTAL, length=120, mode='determinate')
                local_progress['value'] = 0
                local_progress.pack(side=BOTTOM)

                # Follow the teaser link to the full article page.
                requiredURL = container.div.a["href"]
                secondary_URL = requiredURL
                print("Set target URL! " + requiredURL)
                secClient = uReq(secondary_URL)
                news_html = secClient.read()
                secClient.close()
                news_soup = soup(news_html, "html.parser")
                news_soup.decode('utf-8', 'ignore')

                news_containers = news_soup.findAll("div", {"class": "c-entry-hero__header-wrap"})
                if len(news_containers) > 0:
                    news_title = news_containers[0].h1.text
                    CurrentTitle.config(text=news_title)
                    Labels.config(text="Extracted Title!")
                else:
                    print("ERROR! NO TITLE AT " + secondary_URL)
                    Labels.config(text="Failed to extract title")
                    continue  # without a title there is nothing to record

                news_body = news_soup.findAll("div", {"class": "c-entry-content"})
                print("\n TITLE: " + news_title)
                f.write("\n \n" + news_title + "\n")
                print("Now translating...")
                translatedQuery = translate(news_title, "ru", "en")
                t.write("\n \n" + translatedQuery + "\n")
                paragraphs = news_body[0].findAll("p")
                print("Title Recorded!")
                local_progress['value'] = 10

                # Collect the body text in two halves so each translation request stays small.
                y = len(paragraphs)
                x = 0
                fullText = ""
                fullText2 = ""
                for paragraph in paragraphs:
                    x = x + 1
                    local_progress['value'] = x * 100 / y + 10
                    stringx = str(x)
                    Labels.config(text="Getting paragraph " + stringx + "...")
                    print(paragraph.text + "\n \n \n")
                    if x >= y / 2:
                        fullText2 = fullText2 + paragraph.text.strip()
                    else:
                        fullText = fullText + paragraph.text.strip()
                    Labels.config(text="Collected paragraph " + stringx + "!")
                    print("Writing Paragraph " + stringx + "...")
                    if self.needToSkip:
                        break

                if self.needToSkip:
                    self.needToSkip = False
                    continue

                # Translate both halves, then write the original and the translation.
                translatedQuery = translate(fullText, "ru", "en")
                completeText = translatedQuery
                translatedQuery = translate(fullText2, "ru", "en")
                completeText = completeText + translatedQuery
                f.write("\n" + fullText + fullText2)
                t.write("\n" + completeText)

                # Record the lead image URL when one exists.
                news_picture = news_soup.findAll("picture", {"class": "c-picture"})
                Labels.config(text="Getting image...")
                if len(news_picture) > 0 and news_picture[0].img is not None:
                    article_pic = news_picture[0].img.get("src")
                    Labels.config(text="Picture received!")
                    f.write("\n PICTURE URL: " + article_pic)
                    t.write("\n PICTURE URL: " + article_pic)
                else:
                    print("\n THIS ARTICLE HAS NO PICTURE! ")
                    Labels.config(text="Failed to locate picture :(")
                local_progress['value'] = 120

                # Honour a stop request between articles.
                if self.stop_threads.is_set():
                    print("I SURRENDER!")
                    self.stopped = True
                    f.close()
                    t.close()
                    self.CloseLabel.config(text="you may close now")
                    sys.exit()
                else:
                    print("NOTHING IS STOPPING ME!")
                    Labels.config(text="Finished the article!")

            Labels.config(text="All Done!")
            f.close()
            t.close()
texts = "VERGE SCRAPPER"
|
|
|
|
root = Tk()
|
|
|
|
program = Scrapers()
|
|
|
|
mainT = Thread(target=program.start_now)
|
|
|
|
# Header label: fall back to a default if `texts` was not defined above.
try:
    texts
except NameError:
    theLabel = Label(root, text="VERGE SCRAPER")
    theLabel.pack()
    print("NO TEXTS!")
else:
    theLabel = Label(root, text=texts)
    theLabel.pack()
    print("FOUND TEXTS!")

topFrame = Frame(root)
topFrame.pack()
bottomFrame = Frame(root)
bottomFrame.pack(side=BOTTOM)

# Control buttons: start, agency toggle, output directory, and stop.
button1 = Button(topFrame, text="Start Scraping!", command=program.start_thread)
button4 = Button(topFrame, text=agency, command=lambda: switchAgencies(button4['text']))
button2 = Button(topFrame, text="Choose Text Location", fg="black", command=chooseDirectory)
button3 = Button(topFrame, text="STOP!", fg="red", command=program.stopTheThread)

button3.pack(side=TOP)
button1.pack(side=TOP)
button4.pack(side=TOP)
button2.pack(side=TOP)

root.mainloop()