From ccc905e35191162f6396431b3c55c564cf81b7fe Mon Sep 17 00:00:00 2001
From: madiwka3 <madiwka3@madi>
Date: Tue, 7 Jul 2020 12:11:45 +0600
Subject: [PATCH] Add TechRadar as a second news source

---
 GOOGLE.pyw | 403 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 278 insertions(+), 125 deletions(-)

diff --git a/GOOGLE.pyw b/GOOGLE.pyw
index 99eef0d..c953fb0 100755
--- a/GOOGLE.pyw
+++ b/GOOGLE.pyw
@@ -15,10 +15,20 @@ from threading import *
 import os
 from tkinter import messagebox
 desktop = expanduser("~/Documents")
+agency = "verge"  # initial label for button4, the news-source toggle
 def chooseDirectory():
     currdir = os.getcwd()
     tempdir = filedialog.askdirectory(parent=root, initialdir=currdir, title='Please select a directory')
     program.directory = tempdir
+def switchAgencies(agencies):
+    """Flip the news source shown on button4.
+
+    "verge" becomes "techradar"; any other value becomes "verge".
+    """
+    print("called Agencies")
+    target = "techradar" if agencies == "verge" else "verge"
+    print("switching to " + target)
+    button4['text'] = target
 class Scrapers(object):
     def __init__(self):
         self.thread1 = None
@@ -63,136 +73,277 @@ class Scrapers(object):
             self.thread1.start()
             threadActive = 1
     def start_now(self):
-        progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate')
-        progress['value'] = 0
-        progress.pack(side = TOP) 
-        Labels = Label(topFrame, text = "SCRAPING")
-        Labels.pack(side = TOP)
-        texts = "change"
-        main_url = 'https://www.theverge.com/tech'
-        uClient = uReq(main_url)
-        page_html = uClient.read()
-        uClient.close()
-        page_soup = soup(page_html, "html.parser")
-        containers = page_soup.findAll("div",{"class":"c-compact-river__entry"})
-        Articles = len(containers)
-        filename = self.directory + "/News.txt"
-        trans_filename = self.directory + "/TranslatedNews.txt"
-        f = io.open(filename, "w", encoding="utf-8")
-        f.write("ACTIVE")
-        t = io.open(trans_filename, "w", encoding ="utf-8")
-        t.write("ACTIVE")
-        Labels.config(text = "setting file!")
-        i = 0
-        CurrentTitle = Label(topFrame, text = "Preparing...")
-        CurrentTitle.pack(side = TOP)
-        for container in containers:
-          i = i + 1 
-          Labels.config(text = "jumping to URL!")
-          print(container["class"])
-          if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']:
-             print("\n WE'VE CATCHED A BUG!")
-             continue
-          if container.div["class"] != ["c-entry-box--compact", "c-entry-box--compact--article"]:
-             print("\n WARNING! THIS IS NOT AN ARTICLE! ")
-             print(container.div["class"])
-             continue
-          progress['value'] = i * 100 / Articles
-          local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate')
-          local_progress['value'] = 0
-          local_progress.pack(side = BOTTOM)
-          requiredURL = container.div.a["href"]
-          secondary_URL = requiredURL
-          print("Set target URL!")
-          secClient = uReq(secondary_URL)
-          news_html = secClient.read()
-          secClient.close()
-          news_soup = soup(news_html, "html.parser")
-          news_soup.decode('utf-8', 'ignore')
-          news_containers = news_soup.findAll("div", {"class":"c-entry-hero__header-wrap"})
-          if len(news_containers)>0:
-             news_title = news_containers[0].h1.text
-             CurrentTitle.config(text = news_title)
-             Labels.config(text = "Extracted Title!")
-          else:
-             print("ERROR! NO TITLE AT "+secondary_URL)
-             Labels.config(text = "Failed to extract title")
-          news_body = news_soup.findAll("div", {"class":"c-entry-content"})
-          print("\n TITLE: " + news_title)
-          f.write("\n \n" + news_title + "\n")
-          print("Now translating...")
-          translatedQuery = translate(news_title, "ru", "en")
-          t.write("\n \n" + translatedQuery + "\n")
-          paragraphs = news_body[0].findAll("p")
-          print("Title Recorded!")
-          local_progress['value'] = 10
-          y = len(paragraphs)
-          x = 0
-          fullText = ""
-          fullText2 = ""
-          for paragraph in paragraphs:
+        print("Getting" + button4['text'])
+        if button4['text'] == "techradar":
+            progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate')
+            progress['value'] = 0
+            progress.pack(side = TOP) 
+            Labels = Label(topFrame, text = "SCRAPING")
+            Labels.pack(side = TOP)
+            texts = "change"
+            main_url = 'https://www.techradar.com/news'
+            uClient = uReq(main_url)
+            page_html = uClient.read()
+            uClient.close()
+            page_soup = soup(page_html, "html.parser")
+            containers = page_soup.findAll("div",{"class":"listingResult"})
+            
+            Articles = len(containers)
+            print(Articles)
+            filename = self.directory + "/News.txt"
+            trans_filename = self.directory + "/TranslatedNews.txt"
+            f = io.open(filename, "w", encoding="utf-8")
+            f.write("ACTIVE")
+            t = io.open(trans_filename, "w", encoding ="utf-8")
+            t.write("ACTIVE")
+            Labels.config(text = "setting file!")
+            i = 0
+            CurrentTitle = Label(topFrame, text = "Preparing...")
+            CurrentTitle.pack(side = TOP)
+            for container in containers:
+                
+                
+           
+                i = i + 1 
+                Labels.config(text = "jumping to URL!")
+                print(container["class"])
+                if 'sponsored-post' in container["class"]:
+                    print("\n WE'VE CATCHED AN AD!")
+                    continue
+                progress['value'] = i * 100 / Articles
+                local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate')
+                local_progress['value'] = 0
+                local_progress.pack(side = BOTTOM)
+                requiredURL = container.a["href"]
+                secondary_URL = requiredURL
+                print("Set target URL!" + requiredURL)
+                secClient = uReq(secondary_URL)
+                news_html = secClient.read()
+                secClient.close()
+                news_soup = soup(news_html, "html.parser")
+                news_soup.decode('utf-8', 'ignore')
+                squash = news_soup.findAll("div",{"class":"icon-plus_circle"})
+                print(len(squash))
+                if len(squash)>0:
+                    print("\n WARNING! THIS IS NOT AN ARTICLE! ")
+                    print(secondary_URL)  # identify the page; container.div is not guaranteed to exist here
+                    continue
+                news_containers = news_soup.findAll("header")
+                if not news_containers or news_containers[0].h1 is None:
+                    print("ERROR! NO TITLE AT "+secondary_URL)
+                    Labels.config(text = "Failed to extract title")
+                    continue  # no headline: skip rather than hit an unset or stale news_title
+                news_title = news_containers[0].h1.text
+                CurrentTitle.config(text = news_title)
+                Labels.config(text = "Extracted Title!")
+                news_body = news_soup.findAll("div", {"id":"article-body"})
 
-               x = x + 1
-               local_progress['value'] = x * 100 / y + 10
-               stringx = str(x)         
-               Labels.config(text = "Getting paragraph " + stringx + "...")
-               print(paragraph.text + "\n \n \n")
-               if x >= y/2:
-                   fullText2 = fullText2 + paragraph.text.strip()
-               else:
-                   fullText = fullText + paragraph.text.strip()
-               Labels.config(text = "Written and Translated Paragraph" + stringx + "!")
-               print("Writing Paragraph " + stringx + "...")
-               if self.needToSkip:
-                break
-               
-          if self.needToSkip:
-            self.needToSkip = False
-            continue
-          translatedQuery = translate(fullText, "ru", "en")
-          completeText = translatedQuery
-          translatedQuery = translate(fullText2, "ru", "en")
-          completeText = completeText + translatedQuery
-          f.write("\n" + fullText + fullText2)
-          t.write("\n" + completeText)
-          news_picture = news_soup.findAll("picture", {"class":"c-picture"})
-          Labels.config(text = "Getting image...")
-          if news_picture[0].img != None:
-             article_pic = news_picture[0].img.get("src")
-             Labels.config(text = "Picture recieved!")
-          else:
-             print("\n THIS ARTICLE HAS NO PICTURE! ")
-             Labels.config(text = "Failed to locate picture :(")
-          local_progress['value'] = 120
-          f.write("\n PICTURE URL: " + article_pic)
-          t.write("\n PICTURE URL: " + article_pic)
-          if self.stop_threads.is_set():
-            print("I SURRENDER!")
-            self.stopped = True
+                print("\n TITLE: " + news_title)
+                f.write("\n \n" + news_title + "\n")
+                print("Now translating...")
+                translatedQuery = translate(news_title, "ru", "en")
+                t.write("\n \n" + translatedQuery + "\n")
+                paragraphs = news_body[0].findAll("p") if news_body else []  # no article body: treat as zero paragraphs
+                print("Title Recorded!")
+                local_progress['value'] = 10
+                y = len(paragraphs)
+                x = 0
+                fullText = ""
+                fullText2 = ""
+                for paragraph in paragraphs:
+
+                    x = x + 1
+                    local_progress['value'] = x * 100 / y + 10
+                    stringx = str(x)         
+                    Labels.config(text = "Getting paragraph " + stringx + "...")
+                    print(paragraph.text + "\n \n \n")
+                    if x >= y/2:
+                        fullText2 = fullText2 + paragraph.text.strip()
+                    else:
+                        fullText = fullText + paragraph.text.strip()
+                    Labels.config(text = "Written and Translated Paragraph" + stringx + "!")
+                    print("Writing Paragraph " + stringx + "...")
+                    if self.needToSkip:
+                        break
+                    
+                if self.needToSkip:
+                    self.needToSkip = False
+                    continue
+                translatedQuery = translate(fullText, "ru", "en")
+                completeText = translatedQuery
+                translatedQuery = translate(fullText2, "ru", "en")
+                completeText = completeText + translatedQuery
+                f.write("\n" + fullText + fullText2)
+                t.write("\n" + completeText)
+                news_picture = news_soup.findAll("source", {"class":"hero-image"})
+                Labels.config(text = "Getting image...")
+                # Default to "" so that a missing <source> tag or attribute cannot leave
+                # article_pic unset (or None) when it is written to both files below.
+                article_pic = (news_picture[0].get("data-original-mos") or "") if news_picture else ""
+                Labels.config(text = "Picture recieved!" if article_pic else "Failed to locate picture :(")
+                if not article_pic:
+                    print("\n THIS ARTICLE HAS NO PICTURE! ")
+                local_progress['value'] = 120
+                f.write("\n PICTURE URL: " + article_pic)
+                t.write("\n PICTURE URL: " + article_pic)
+                if self.stop_threads.is_set():
+                    print("I SURRENDER!")
+                    self.stopped = True
+                    f.close()
+                    t.close()
+                    self.CloseLabel.config(text = "you may close now")
+                    # sys.exit() raises SystemExit and leaves start_now immediately, so the
+                    # old "I tried, I failed" label update and break after it were unreachable.
+                    sys.exit()
+                else:
+                    print("NOTHING IS STOPPING ME!")
+                    Labels.config(text = "Finished the article!")
+            #brand = divWithInfo.div.a.img["title"]
+            #title_container = divWithInfo.find("a", "item-title")
+            #product_name = title_container.text
+            #shipping_container = divWithInfo.find("li", "price-ship")
+            #shipping_cost = shipping_container.text.strip()
+
+            #print("brand:"+brand)
+            #print("name:"+product_name)
+            #print("shipping:"+shipping_cost)
+            #print("\n")
+
+            #f.write(brand + "," + product_name.replace(",", "|") + "," + shipping_cost + "\n")
+            Labels.config(text = "All Done!")
             f.close()
             t.close()
-            self.CloseLabel.config(text = "you may close now")
-            sys.exit() 
-            self.CloseLabel.config(text = "I tried, I failed")
-            break
-          else:
-            print("NOTHING IS STOPPING ME!")
-            Labels.config(text = "Finished the article!")
-        #brand = divWithInfo.div.a.img["title"]
-        #title_container = divWithInfo.find("a", "item-title")
-        #product_name = title_container.text
-        #shipping_container = divWithInfo.find("li", "price-ship")
-        #shipping_cost = shipping_container.text.strip()
+        else: 
+            progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate')
+            progress['value'] = 0
+            progress.pack(side = TOP) 
+            Labels = Label(topFrame, text = "SCRAPING")
+            Labels.pack(side = TOP)
+            texts = "change"
+            main_url = 'https://www.theverge.com/tech'
+            uClient = uReq(main_url)
+            page_html = uClient.read()
+            uClient.close()
+            page_soup = soup(page_html, "html.parser")
+            containers = page_soup.findAll("div",{"class":"c-compact-river__entry"})
+            Articles = len(containers)
+            filename = self.directory + "/News.txt"
+            trans_filename = self.directory + "/TranslatedNews.txt"
+            f = io.open(filename, "w", encoding="utf-8")
+            f.write("ACTIVE")
+            t = io.open(trans_filename, "w", encoding ="utf-8")
+            t.write("ACTIVE")
+            Labels.config(text = "setting file!")
+            i = 0
+            CurrentTitle = Label(topFrame, text = "Preparing...")
+            CurrentTitle.pack(side = TOP)
+            for container in containers:
+                i = i + 1 
+                Labels.config(text = "jumping to URL!")
+                print(container["class"])
+                if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']:
+                    print("\n WE'VE CATCHED A BUG!")
+                    continue
+                if container.div["class"] != ["c-entry-box--compact", "c-entry-box--compact--article"]:
+                    print("\n WARNING! THIS IS NOT AN ARTICLE! ")
+                    print(container.div["class"])
+                    continue
+                progress['value'] = i * 100 / Articles
+                local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate')
+                local_progress['value'] = 0
+                local_progress.pack(side = BOTTOM)
+                requiredURL = container.div.a["href"]
+                secondary_URL = requiredURL
+                print("Set target URL!")
+                secClient = uReq(secondary_URL)
+                news_html = secClient.read()
+                secClient.close()
+                news_soup = soup(news_html, "html.parser")
+                news_soup.decode('utf-8', 'ignore')
+                news_containers = news_soup.findAll("div", {"class":"c-entry-hero__header-wrap"})
+                if not news_containers or news_containers[0].h1 is None:
+                    print("ERROR! NO TITLE AT "+secondary_URL)
+                    Labels.config(text = "Failed to extract title")
+                    continue  # no headline: skip rather than hit an unset or stale news_title
+                news_title = news_containers[0].h1.text
+                CurrentTitle.config(text = news_title)
+                Labels.config(text = "Extracted Title!")
+                news_body = news_soup.findAll("div", {"class":"c-entry-content"})
+                print("\n TITLE: " + news_title)
+                f.write("\n \n" + news_title + "\n")
+                print("Now translating...")
+                translatedQuery = translate(news_title, "ru", "en")
+                t.write("\n \n" + translatedQuery + "\n")
+                paragraphs = news_body[0].findAll("p") if news_body else []  # no article body: treat as zero paragraphs
+                print("Title Recorded!")
+                local_progress['value'] = 10
+                y = len(paragraphs)
+                x = 0
+                fullText = ""
+                fullText2 = ""
+                for paragraph in paragraphs:
 
-        #print("brand:"+brand)
-        #print("name:"+product_name)
-        #print("shipping:"+shipping_cost)
-        #print("\n")
+                    x = x + 1
+                    local_progress['value'] = x * 100 / y + 10
+                    stringx = str(x)         
+                    Labels.config(text = "Getting paragraph " + stringx + "...")
+                    print(paragraph.text + "\n \n \n")
+                    if x >= y/2:
+                        fullText2 = fullText2 + paragraph.text.strip()
+                    else:
+                        fullText = fullText + paragraph.text.strip()
+                    Labels.config(text = "Written and Translated Paragraph" + stringx + "!")
+                    print("Writing Paragraph " + stringx + "...")
+                    if self.needToSkip:
+                        break
+                    
+                if self.needToSkip:
+                    self.needToSkip = False
+                    continue
+                translatedQuery = translate(fullText, "ru", "en")
+                completeText = translatedQuery
+                translatedQuery = translate(fullText2, "ru", "en")
+                completeText = completeText + translatedQuery
+                f.write("\n" + fullText + fullText2)
+                t.write("\n" + completeText)
+                news_picture = news_soup.findAll("picture", {"class":"c-picture"})
+                Labels.config(text = "Getting image...")
+                # "" default: no IndexError on a missing <picture>, no unset article_pic below.
+                picture_img = news_picture[0].img if news_picture else None
+                article_pic = (picture_img.get("src") or "") if picture_img is not None else ""
+                Labels.config(text = "Picture recieved!" if article_pic else "Failed to locate picture :(")
+                if not article_pic:
+                    print("\n THIS ARTICLE HAS NO PICTURE! ")
+                local_progress['value'] = 120
+                f.write("\n PICTURE URL: " + article_pic)
+                t.write("\n PICTURE URL: " + article_pic)
+                if self.stop_threads.is_set():
+                    print("I SURRENDER!")
+                    self.stopped = True
+                    f.close()
+                    t.close()
+                    self.CloseLabel.config(text = "you may close now")
+                    # sys.exit() raises SystemExit and leaves start_now immediately, so the
+                    # old "I tried, I failed" label update and break after it were unreachable.
+                    sys.exit()
+                else:
+                    print("NOTHING IS STOPPING ME!")
+                    Labels.config(text = "Finished the article!")
+            #brand = divWithInfo.div.a.img["title"]
+            #title_container = divWithInfo.find("a", "item-title")
+            #product_name = title_container.text
+            #shipping_container = divWithInfo.find("li", "price-ship")
+            #shipping_cost = shipping_container.text.strip()
 
-        #f.write(brand + "," + product_name.replace(",", "|") + "," + shipping_cost + "\n")
-        Labels.config(text = "All Done!")
-        f.close()
-        t.close()
+            #print("brand:"+brand)
+            #print("name:"+product_name)
+            #print("shipping:"+shipping_cost)
+            #print("\n")
+
+            #f.write(brand + "," + product_name.replace(",", "|") + "," + shipping_cost + "\n")
+            Labels.config(text = "All Done!")
+            f.close()
+            t.close()
 texts = "VERGE SCRAPPER"
 root = Tk()
 program = Scrapers()
@@ -214,9 +365,11 @@ topFrame.pack()
 bottomFrame = Frame(root)
 bottomFrame.pack(side=BOTTOM)
 button1 = Button(topFrame, text = "Start Scrapping!", command = program.start_thread)
+button4 = Button(topFrame, text = agency, command = lambda: switchAgencies(button4['text']))  # source toggle; start_now reads this label to pick the site
 button2 = Button(topFrame, text = "Choose Text Location", fg = "black", command = chooseDirectory)
 button3 = Button(topFrame, text = "STOP!", fg = "red", command = program.stopTheThread)
 button3.pack(side = TOP)
 button1.pack(side= TOP)
+button4.pack(side= TOP)
 button2.pack(side = TOP)
 root.mainloop()
\ No newline at end of file