# Madiwka / vergescrapper
# Mirror of https://github.com/Madiwka4/vergescrapper

import tkinter
from tkinter import *
from os.path import expanduser

from tkinter import filedialog
from tkinter.ttk import Progressbar
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import io
from mtranslate import translate
import playsound
import sys
import time
from threading import *
import os
from tkinter import messagebox
# Default output folder for News.txt / TranslatedNews.txt: the user's Documents directory.
desktop = expanduser("~/Documents")
# Initial caption of the source-toggle button; the button text is what selects the site.
agency = "verge"
def chooseDirectory():
    """Ask the user for an output folder and store it on the scraper.

    Opens a directory picker starting at the current working directory.
    If the dialog is cancelled (it then returns an empty value) the
    previously configured ``program.directory`` is left untouched, so the
    output files are never redirected to the filesystem root by accident.
    """
    currdir = os.getcwd()
    tempdir = filedialog.askdirectory(parent=root, initialdir=currdir, title='Please select a directory')
    if tempdir:
        program.directory = tempdir
def switchAgencies(agencies):
    """Flip the source-toggle button to the other news site.

    ``agencies`` is the caption currently shown on the button; "verge"
    switches to "techradar", anything else switches to "verge".
    """
    print("called Agencies")
    if agencies != "verge":
        print("switching to verge")
        next_agency = "verge"
    else:
        print("switching to techradar")
        next_agency = "techradar"
    # The button caption doubles as the application's "selected site" state.
    button4['text'] = next_agency
class Scrapers(object):
    """Scrape tech news from The Verge or TechRadar and save an EN/RU copy.

    One instance backs the whole GUI: the buttons call ``start_thread``,
    ``stopTheThread`` and ``skip``; the scraping itself runs in ``start_now``
    on a worker thread.  The methods rely on the module-level widgets
    ``root``, ``topFrame`` and ``button4`` existing by the time they run.

    NOTE(review): ``start_now`` updates Tk widgets from the worker thread.
    Tkinter is not generally safe for cross-thread use -- confirm, and
    consider marshalling the updates through ``root.after``.
    """

    def __init__(self):
        # Worker thread running start_now; None until scraping is started.
        self.thread1 = None
        # Set by the STOP button; checked by the worker after each article.
        self.stop_threads = Event()
        # Set by the worker once it has honoured a stop request.
        self.stopped = False
        self.CloseLabel = Label(root, text="Finalizing before breaking!")
        # Folder receiving News.txt and TranslatedNews.txt.
        self.directory = desktop
        # Set by the SKIP button; makes the worker abandon the current article.
        self.needToSkip = False

    def waitandkill(self, poll_interval=1):
        """Block until the worker has acknowledged a stop request.

        Polls every ``poll_interval`` seconds and prints "DEAD" once
        ``self.stopped`` is set.  Returns silently if there is no live worker
        left that could ever set the flag, so the call cannot spin forever.
        """
        while True:
            time.sleep(poll_interval)
            if self.stopped:
                print("DEAD")
                return
            worker = self.thread1
            if worker is None or not worker.is_alive():
                return

    def stopTheThread(self):
        """Request the worker to stop after the article it is processing."""
        print("CALLED ME TOO?")
        self.stop_threads.set()
        self.CloseLabel.pack()
        print("calling wait")
        # Wait off the GUI thread: waitandkill sleeps, which would otherwise
        # freeze the window until the worker finishes its current article.
        Thread(target=self.waitandkill, daemon=True).start()

    def skip(self):
        """Ask the worker to abandon the article it is currently reading."""
        self.needToSkip = True

    def start_thread(self):
        """Verify the output files are writable, then start the worker.

        Does nothing (beyond printing "NO!") when a worker was already
        started, so a second click can no longer truncate files that are
        being written or pile up extra SKIP buttons.  Exits the program after
        showing an error dialog if either output file cannot be opened.
        """
        if self.thread1 is not None:
            print("NO!")
            return
        for name in ("/TranslatedNews.txt", "/News.txt"):
            path = self.directory + name
            try:
                # Opening for writing is the access probe; close immediately.
                with io.open(path, "w", encoding="utf-8"):
                    pass
            except IOError:
                print("FILE ERROR!" + path)
                messagebox.showerror("ACCESS ERROR!", "We can't access " + path)
                sys.exit()
        Skip = Button(topFrame, text="SKIP!", command=self.skip)
        Skip.pack(side=BOTTOM)
        self.thread1 = Thread(target=self.start_now)
        self.thread1.start()

    def start_now(self):
        """Worker entry point: scrape the site selected on the toggle button."""
        print("Getting" + button4['text'])
        self._scrape(button4['text'] == "techradar")

    @staticmethod
    def _fetch_soup(url):
        """Download ``url`` and return it parsed with the html.parser backend."""
        with uReq(url) as client:
            html = client.read()
        return soup(html, "html.parser")

    @staticmethod
    def _split_halves(texts):
        """Join ``texts`` into two strings, translated as separate requests.

        Item number x (1-based) of y goes to the second string when
        ``x >= y / 2`` and to the first one otherwise.
        """
        total = len(texts)
        first = [text for n, text in enumerate(texts, 1) if n < total / 2]
        second = [text for n, text in enumerate(texts, 1) if n >= total / 2]
        return "".join(first), "".join(second)

    @staticmethod
    def _should_skip_entry(container, techradar):
        """Return True for listing entries that are known not to be articles."""
        if techradar:
            if 'sponsored-post' in container["class"]:
                print("\n WE'VE CATCHED AN AD!")
                return True
            return False
        if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']:
            print("\n WE'VE CATCHED A BUG!")
            return True
        if container.div["class"] != ["c-entry-box--compact", "c-entry-box--compact--article"]:
            print("\n WARNING! THIS IS NOT AN ARTICLE! ")
            print(container.div["class"])
            return True
        return False

    @staticmethod
    def _find_title(news_soup, techradar):
        """Return the article headline, or None when the page has none."""
        if techradar:
            holders = news_soup.findAll("header")
        else:
            holders = news_soup.findAll("div", {"class": "c-entry-hero__header-wrap"})
        if holders and holders[0].h1 is not None:
            return holders[0].h1.text
        return None

    @staticmethod
    def _find_picture_url(news_soup, techradar):
        """Return the lead picture URL, or None when the page has none."""
        if techradar:
            sources = news_soup.findAll("source", {"class": "hero-image"})
            return sources[0].get("data-original-mos") if sources else None
        pictures = news_soup.findAll("picture", {"class": "c-picture"})
        if pictures and pictures[0].img is not None:
            return pictures[0].img.get("src")
        return None

    def _scrape(self, techradar):
        """Walk the site's listing page and record every article found.

        For each article the title and text go to News.txt and their Russian
        translation to TranslatedNews.txt, followed by the picture URL when
        one is found.  Articles without a title or a body are skipped instead
        of being written with data left over from the previous article.
        ``techradar`` selects TechRadar; otherwise The Verge is scraped.
        """
        progress = Progressbar(topFrame, orient=HORIZONTAL, length=100, mode='determinate')
        progress['value'] = 0
        progress.pack(side=TOP)
        Labels = Label(topFrame, text="SCRAPING")
        Labels.pack(side=TOP)

        if techradar:
            page_soup = self._fetch_soup('https://www.techradar.com/news')
            containers = page_soup.findAll("div", {"class": "listingResult"})
        else:
            page_soup = self._fetch_soup('https://www.theverge.com/tech')
            containers = page_soup.findAll("div", {"class": "c-compact-river__entry"})
        Articles = len(containers)
        if techradar:
            print(Articles)

        filename = self.directory + "/News.txt"
        trans_filename = self.directory + "/TranslatedNews.txt"
        stop_requested = False
        # The with-block guarantees both files are closed on every exit path,
        # including a network error part-way through the run.
        with io.open(filename, "w", encoding="utf-8") as f, \
                io.open(trans_filename, "w", encoding="utf-8") as t:
            f.write("ACTIVE")
            t.write("ACTIVE")
            Labels.config(text="setting file!")
            CurrentTitle = Label(topFrame, text="Preparing...")
            CurrentTitle.pack(side=TOP)
            for i, container in enumerate(containers, 1):
                Labels.config(text="jumping to URL!")
                print(container["class"])
                if self._should_skip_entry(container, techradar):
                    continue
                progress['value'] = i * 100 / Articles
                # NOTE(review): a new bar is packed for every article and never
                # removed -- confirm whether that accumulation is intended.
                local_progress = Progressbar(topFrame, orient=HORIZONTAL, length=120, mode='determinate')
                local_progress['value'] = 0
                local_progress.pack(side=BOTTOM)
                if techradar:
                    secondary_URL = container.a["href"]
                    print("Set target URL!" + secondary_URL)
                else:
                    secondary_URL = container.div.a["href"]
                    print("Set target URL!")
                news_soup = self._fetch_soup(secondary_URL)
                if techradar:
                    # Pages carrying this marker are treated as non-articles.
                    squash = news_soup.findAll("div", {"class": "icon-plus_circle"})
                    print(len(squash))
                    if len(squash) > 0:
                        print("\n WARNING! THIS IS NOT AN ARTICLE! ")
                        print(container.div["class"])
                        continue

                news_title = self._find_title(news_soup, techradar)
                if news_title is None:
                    print("ERROR! NO TITLE AT " + secondary_URL)
                    Labels.config(text="Failed to extract title")
                    continue
                CurrentTitle.config(text=news_title)
                Labels.config(text="Extracted Title!")

                if techradar:
                    news_body = news_soup.findAll("div", {"id": "article-body"})
                else:
                    news_body = news_soup.findAll("div", {"class": "c-entry-content"})
                if not news_body:
                    print("ERROR! NO ARTICLE BODY AT " + secondary_URL)
                    Labels.config(text="Failed to extract article body")
                    continue

                print("\n TITLE: " + news_title)
                f.write("\n \n" + news_title + "\n")
                print("Now translating...")
                translatedQuery = translate(news_title, "ru", "en")
                t.write("\n \n" + translatedQuery + "\n")
                paragraphs = news_body[0].findAll("p")
                print("Title Recorded!")
                local_progress['value'] = 10
                y = len(paragraphs)
                collected = []
                for x, paragraph in enumerate(paragraphs, 1):
                    local_progress['value'] = x * 100 / y + 10
                    stringx = str(x)
                    Labels.config(text="Getting paragraph " + stringx + "...")
                    print(paragraph.text + "\n \n \n")
                    collected.append(paragraph.text.strip())
                    Labels.config(text="Written and Translated Paragraph" + stringx + "!")
                    print("Writing Paragraph " + stringx + "...")
                    if self.needToSkip:
                        break

                if self.needToSkip:
                    # Title is already on file; the body is dropped on request.
                    self.needToSkip = False
                    continue

                fullText, fullText2 = self._split_halves(collected)
                completeText = translate(fullText, "ru", "en") + translate(fullText2, "ru", "en")
                f.write("\n" + fullText + fullText2)
                t.write("\n" + completeText)

                Labels.config(text="Getting image...")
                article_pic = self._find_picture_url(news_soup, techradar)
                if article_pic:
                    Labels.config(text="Picture recieved!")
                else:
                    print("\n THIS ARTICLE HAS NO PICTURE! ")
                    Labels.config(text="Failed to locate picture :(")
                local_progress['value'] = 120
                if article_pic:
                    f.write("\n PICTURE URL: " + article_pic)
                    t.write("\n PICTURE URL: " + article_pic)

                if self.stop_threads.is_set():
                    print("I SURRENDER!")
                    stop_requested = True
                    break
                print("NOTHING IS STOPPING ME!")
                Labels.config(text="Finished the article!")

        if stop_requested:
            # Files are closed by now, so it is safe to tell the user to quit.
            self.stopped = True
            self.CloseLabel.config(text="you may close now")
            return
        Labels.config(text="All Done!")
# --- Application start-up: build the main window and run the Tk event loop ---
texts = "VERGE SCRAPPER"
root = Tk()
# Scrapers.__init__ creates a Label on `root`, so the window must exist first.
program = Scrapers()
# Never started and not used elsewhere in this file; the module-level name is
# kept so nothing that may reference it breaks.
mainT = Thread(target=program.start_now)

# `texts` is assigned just above, so the title label can use it directly.
theLabel = Label(root, text = texts)
theLabel.pack()
print("FOUND TEXTS!")

# Not read anywhere in this file; kept as a module-level name.
stop_thread = False

topFrame = Frame(root)
topFrame.pack()
bottomFrame = Frame(root)
bottomFrame.pack(side=BOTTOM)

button1 = Button(topFrame, text = "Start Scrapping!", command = program.start_thread)
# The caption of this button is the selected site; clicking toggles it.
button4 = Button(topFrame, text = agency, command = lambda: switchAgencies(button4['text']))
button2 = Button(topFrame, text = "Choose Text Location", fg = "black", command = chooseDirectory)
button3 = Button(topFrame, text = "STOP!", fg = "red", command = program.stopTheThread)

# Pack order determines the top-to-bottom layout: STOP, Start, site toggle, folder.
button3.pack(side = TOP)
button1.pack(side= TOP)
button4.pack(side= TOP)
button2.pack(side = TOP)
root.mainloop()