added techradar

6 年之前 · ccc905e351
--- a/GOOGLE.pyw
+++ b/GOOGLE.pyw
@@ -15,10 +15,20 @@ from threading import *
 import os
 from tkinter import messagebox
 # "~/Documents" with "~" expanded. Despite the name this is the Documents
 # folder, not the desktop. NOTE(review): presumably os.path.expanduser; its
 # import is not visible in this part of the file -- confirm.
 desktop = expanduser("~/Documents")
 # Initial caption of the site-selection button (button4).
 agency = "verge"
 def chooseDirectory():
    """Ask the user for an output directory and store it on ``program``.

    Opens a directory chooser that starts in the current working directory.
    The selection is saved as ``program.directory``, which ``start_now``
    uses as the folder for its output files.

    If the dialog is cancelled the previous ``program.directory`` is kept.
    """
    currdir = os.getcwd()
    tempdir = filedialog.askdirectory(parent=root, initialdir=currdir, title='Please select a directory')
    # askdirectory returns an empty value when the user cancels; storing it
    # would make later writes target "/News.txt" at the filesystem root.
    if tempdir:
        program.directory = tempdir
 def switchAgencies(agencies):
    """Flip the site shown on ``button4`` between "verge" and "techradar".

    ``agencies`` is the caption currently displayed. "verge" switches to
    "techradar"; any other value switches to "verge".
    """
    print("called Agencies")
    if agencies != "verge":
        notice, next_agency = "switching to verge", "verge"
    else:
        notice, next_agency = "switching to techradar", "techradar"
    print(notice)
    button4['text'] = next_agency
 class Scrapers(object):
    def __init__(self):
        self.thread1 = None
@@ -63,136 +73,277 @@ class Scrapers(object):
            self.thread1.start()
            threadActive = 1
    def start_now(self):
        """Scrape the site currently shown on ``button4`` and save it to disk.

        The button caption selects the source: ``"techradar"`` scrapes
        https://www.techradar.com/news, any other caption scrapes
        https://www.theverge.com/tech.  For every article on the listing page
        the title and paragraph text are written to ``News.txt`` and the
        result of ``translate(text, "ru", "en")`` to ``TranslatedNews.txt``,
        both inside ``self.directory``.  (NOTE(review): presumably an
        English-to-Russian translation -- confirm against ``translate``.)

        Progress is shown through widgets packed into ``topFrame``.

        Entries without a link, title or body are skipped instead of aborting
        the whole run, and the "PICTURE URL" line is only written when a
        picture URL was actually found.

        Raises:
            SystemExit: via ``sys.exit()`` after an article has been written
                while ``self.stop_threads`` is set.
        """
        techradar = button4['text'] == "techradar"
        print("Getting" + ("techradar" if techradar else button4['text']))
        if techradar:
            main_url = 'https://www.techradar.com/news'
            entry_class = "listingResult"
        else:
            main_url = 'https://www.theverge.com/tech'
            entry_class = "c-compact-river__entry"

        progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate')
        progress['value'] = 0
        progress.pack(side = TOP)
        Labels = Label(topFrame, text = "SCRAPING")
        Labels.pack(side = TOP)

        page_soup = self._fetch_soup(main_url)
        containers = page_soup.findAll("div", {"class": entry_class})
        Articles = len(containers)
        if techradar:
            print(Articles)

        filename = self.directory + "/News.txt"
        trans_filename = self.directory + "/TranslatedNews.txt"
        # The context managers guarantee both files are closed even when a
        # request or parse step raises half-way through the run.
        with io.open(filename, "w", encoding="utf-8") as f, \
                io.open(trans_filename, "w", encoding="utf-8") as t:
            f.write("ACTIVE")
            t.write("ACTIVE")
            Labels.config(text = "setting file!")
            CurrentTitle = Label(topFrame, text = "Preparing...")
            CurrentTitle.pack(side = TOP)
            # i counts every listing entry, including skipped ones, so the
            # overall bar reaches 100 on the last entry.
            for i, container in enumerate(containers, start=1):
                Labels.config(text = "jumping to URL!")
                print(container["class"])
                if self._is_skipped_entry(container, techradar):
                    continue
                progress['value'] = i * 100 / Articles
                local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate')
                local_progress['value'] = 0
                local_progress.pack(side = BOTTOM)
                finished = self._scrape_article(
                    container, techradar, f, t, Labels, CurrentTitle, local_progress)
                if not finished:
                    # Skipped articles do not reach the stop check below.
                    continue
                if self.stop_threads.is_set():
                    print("I SURRENDER!")
                    self.stopped = True
                    # Close before telling the user it is safe to quit; the
                    # second close done by the ``with`` block is a no-op.
                    f.close()
                    t.close()
                    self.CloseLabel.config(text = "you may close now")
                    sys.exit()
                print("NOTHING IS STOPPING ME!")
                Labels.config(text = "Finished the article!")
            Labels.config(text = "All Done!")

    def _fetch_soup(self, url):
        """Download ``url`` with ``uReq`` and parse it with ``html.parser``."""
        client = uReq(url)
        try:
            page_html = client.read()
        finally:
            client.close()
        return soup(page_html, "html.parser")

    def _is_skipped_entry(self, container, techradar):
        """Return True when a listing entry must not be scraped."""
        if techradar:
            if 'sponsored-post' in container["class"]:
                print("\n WE'VE CATCHED AN AD!")
                return True
            return False
        if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']:
            print("\n WE'VE CATCHED A BUG!")
            return True
        box = container.div
        box_classes = box.get("class") if box is not None else None
        if box_classes != ["c-entry-box--compact", "c-entry-box--compact--article"]:
            print("\n WARNING! THIS IS NOT AN ARTICLE! ")
            print(box_classes)
            return True
        return False

    def _scrape_article(self, container, techradar, f, t, Labels, CurrentTitle, local_progress):
        """Fetch one article and append it to ``f`` and its translation to ``t``.

        Returns True when the article was written completely, False when it
        was skipped (unusable page, or ``self.needToSkip`` was set).
        """
        box = container if techradar else container.div
        link = box.a if box is not None else None
        secondary_URL = link.get("href") if link is not None else None
        if not secondary_URL:
            print("ERROR! NO LINK IN ENTRY")
            return False
        if techradar:
            print("Set target URL!" + secondary_URL)
        else:
            print("Set target URL!")
        news_soup = self._fetch_soup(secondary_URL)

        if techradar:
            squash = news_soup.findAll("div", {"class": "icon-plus_circle"})
            print(len(squash))
            if len(squash) > 0:
                print("\n WARNING! THIS IS NOT AN ARTICLE! ")
                inner = container.div
                print(inner.get("class") if inner is not None else None)
                return False
            news_containers = news_soup.findAll("header")
            news_body = news_soup.findAll("div", {"id": "article-body"})
        else:
            news_containers = news_soup.findAll("div", {"class": "c-entry-hero__header-wrap"})
            news_body = news_soup.findAll("div", {"class": "c-entry-content"})

        # Validate before writing anything so a broken page leaves no
        # half-written entry in the output files.
        heading = news_containers[0].h1 if news_containers else None
        if heading is None:
            print("ERROR! NO TITLE AT " + secondary_URL)
            Labels.config(text = "Failed to extract title")
            return False
        news_title = heading.text
        CurrentTitle.config(text = news_title)
        Labels.config(text = "Extracted Title!")
        if not news_body:
            print("ERROR! NO BODY AT " + secondary_URL)
            Labels.config(text = "Failed to extract body")
            return False

        print("\n TITLE: " + news_title)
        f.write("\n \n" + news_title + "\n")
        print("Now translating...")
        t.write("\n \n" + translate(news_title, "ru", "en") + "\n")
        paragraphs = news_body[0].findAll("p")
        print("Title Recorded!")
        local_progress['value'] = 10

        # The text is collected in two halves that are translated separately.
        y = len(paragraphs)
        first_half = []
        second_half = []
        for x, paragraph in enumerate(paragraphs, start=1):
            local_progress['value'] = x * 100 / y + 10
            stringx = str(x)
            Labels.config(text = "Getting paragraph " + stringx + "...")
            print(paragraph.text + "\n \n \n")
            if x >= y / 2:
                second_half.append(paragraph.text.strip())
            else:
                first_half.append(paragraph.text.strip())
            Labels.config(text = "Written and Translated Paragraph" + stringx + "!")
            print("Writing Paragraph " + stringx + "...")
            if self.needToSkip:
                break
        if self.needToSkip:
            # The flag is one-shot: consume it and abandon this article.
            self.needToSkip = False
            return False

        fullText = "".join(first_half)
        fullText2 = "".join(second_half)
        completeText = translate(fullText, "ru", "en")
        completeText = completeText + translate(fullText2, "ru", "en")
        f.write("\n" + fullText + fullText2)
        t.write("\n" + completeText)

        Labels.config(text = "Getting image...")
        if techradar:
            news_picture = news_soup.findAll("source", {"class": "hero-image"})
            article_pic = news_picture[0].get("data-original-mos") if news_picture else None
        else:
            news_picture = news_soup.findAll("picture", {"class": "c-picture"})
            image = news_picture[0].img if news_picture else None
            article_pic = image.get("src") if image is not None else None
        if article_pic:
            Labels.config(text = "Picture recieved!")
        else:
            print("\n THIS ARTICLE HAS NO PICTURE! ")
            Labels.config(text = "Failed to locate picture :(")
        local_progress['value'] = 120
        # Only emit the picture line when there is a URL, so a missing image
        # can neither crash the run nor repeat the previous article's URL.
        if article_pic:
            f.write("\n PICTURE URL: " + article_pic)
            t.write("\n PICTURE URL: " + article_pic)
        return True
 # Module-level title string; it is not read anywhere in the visible part of
 # this file.
 texts = "VERGE SCRAPPER"
 # Application main window.
 root = Tk()
 # The single Scrapers instance whose methods are used as button commands.
 program = Scrapers()
@@ -214,9 +365,11 @@ topFrame.pack()
 # Frame docked at the bottom of the window; nothing is added to it in the
 # visible code.
 bottomFrame = Frame(root)
 bottomFrame.pack(side=BOTTOM)
 # Starts scraping through Scrapers.start_thread.
 button1 = Button(topFrame, text = "Start Scrapping!", command = program.start_thread)
 # Shows the selected site; clicking passes the current caption to
 # switchAgencies, which flips it between "verge" and "techradar".
 button4 = Button(topFrame, text = agency, command = lambda: switchAgencies(button4['text']))
 # Opens the directory chooser for the output location.
 button2 = Button(topFrame, text = "Choose Text Location", fg = "black", command = chooseDirectory)
 # Asks the running scrape to stop through Scrapers.stopTheThread.
 button3 = Button(topFrame, text = "STOP!", fg = "red", command = program.stopTheThread)
 # All four buttons use side=TOP, so the pack order below is their relative
 # top-to-bottom order: STOP, start, site toggle, directory chooser.
 button3.pack(side = TOP)
 button1.pack(side= TOP)
 button4.pack(side= TOP)
 button2.pack(side = TOP)
 # Enter the Tk event loop.
 root.mainloop()