Commit: updated scraper and added picker

New file: hello-fresh-scraper.py (88 lines added)
@@ -0,0 +1,88 @@
# Scrape HelloFresh recipe sitemap pages with Selenium and collect a
# recipe-name -> recipe-card-URL index into a JSON file.

# stdlib
import json
import time
import urllib.request
from string import ascii_lowercase

# third-party (selenium)
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common import keys
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Each sitemap page lists recipes for the segment appended to this base URL
# (e.g. ".../recipes-a", ".../recipes-s2").
url_base = "https://www.hellofresh.com/pages/sitemap/recipes-"

# Chrome profile tuned for unattended PDF downloads: fixed download folder,
# no download prompt, Safe Browsing disabled so PDFs are not quarantined,
# and PDFs saved to disk instead of opened in the built-in viewer.
chrome_options = Options()
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": "E:/Nico/Documents/GitHub/hello-fresh-scraper/",
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing_for_trusted_sources_enabled": False,
    "safebrowsing.enabled": False,
    "plugins.always_open_pdf_externally": True,
    "excludeSwitches": ["disable-popup-blocking"],
})

# Chrome was the original target; the Firefox driver is currently used
# instead, so the options above are prepared but not applied.
# chrome_options.set_capability("excludedSwitches", "disable-popup-blocking")
# driver = webdriver.Chrome(chrome_options=chrome_options)
driver = webdriver.Firefox()
def enable_download_headless(browser, download_dir):
    """Allow file downloads in a headless Chrome session.

    Registers the non-public Chromium ``send_command`` endpoint on the
    driver's command executor, then issues ``Page.setDownloadBehavior``
    so that downloads are saved into *download_dir* instead of being
    blocked (headless Chrome blocks downloads by default).
    """
    endpoint = ("POST", '/session/$sessionId/chromium/send_command')
    browser.command_executor._commands["send_command"] = endpoint
    command = {
        'cmd': 'Page.setDownloadBehavior',
        'params': {'behavior': 'allow', 'downloadPath': download_dir},
    }
    browser.execute("send_command", command)
# Destination folder for downloaded recipe PDFs. The headless-download
# helper is Chrome-specific and stays disabled while Firefox is in use.
download_dir = "E:/Nico/Documents/GitHub/hello-fresh-scraper/"
# enable_download_headless(driver, download_dir)

# Seed the index from the existing JSON file so repeated runs accumulate
# links instead of overwriting earlier segments.
recipes = {}
with open("E:/Nico/Documents/HelloFresh/recipes.json", "r", encoding='utf8') as fh:
    recipes = json.load(fh)
# Visit each sitemap segment, harvest recipe names and links from the
# recipe table, then persist the accumulated index as JSON.
for x in ["s2"]:  # limited to one segment; use ascii_lowercase for a full crawl
    print("Going to: {}{}".format(url_base, x))
    driver.get(url_base + x)

    # Explicit wait available for later interactions on the page.
    wait = WebDriverWait(driver, 10)

    # Remember the original tab; the (currently disabled) per-recipe PDF
    # download flow opened recipe pages in new tabs and switched back here.
    original_window = driver.current_window_handle

    # Fail fast if stray windows are already open — they would confuse the
    # tab handling. An explicit raise, unlike `assert`, survives `-O`.
    if len(driver.window_handles) != 1:
        raise RuntimeError("expected exactly one browser window before scraping")

    # The second element with this class holds the recipe table.
    # NOTE(review): "css-6ht6q2" is a generated CSS-in-JS class and may
    # change whenever HelloFresh redeploys — verify before relying on it.
    topdiv = driver.find_elements_by_class_name("css-6ht6q2")[1]
    rows = topdiv.find_elements_by_tag_name("tr")
    for row in rows:
        col = row.find_element_by_tag_name("td")
        # Filesystem-friendly key: words joined by underscores, dropping
        # the final word of the cell text.
        destFile = "_".join(col.text.split(" ")[:-1])
        print(destFile)
        link = col.find_element_by_tag_name("a")
        recipes[destFile] = link.get_attribute('href')
        # A per-recipe flow (open link in a new tab, fetch the PDF link,
        # urlretrieve it, close the tab) previously lived here; it is
        # disabled in favour of just collecting the links.

driver.quit()

# Persist the name -> URL index; ensure_ascii=False keeps accented recipe
# names readable, sort_keys makes reruns diff-friendly.
with open("E:/Nico/Documents/HelloFresh/recipes.json", "w", encoding='utf8') as file:
    json.dump(recipes, file, indent=4, ensure_ascii=False, sort_keys=True)
Deleted: previous version of hello-fresh-scraper.py (11 lines removed)
@@ -1,11 +0,0 @@
# (Removed by this commit.) First-pass scraper: merely opens each sitemap
# page in Chrome and pauses so it can be inspected by hand.
from string import ascii_lowercase
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

url_base = "https://www.hellofresh.com/pages/sitemap/recipes-"

driver = webdriver.Chrome()
for letter in ['a']:  # swap in ascii_lowercase to visit every segment
    print("Going to: {}{}".format(url_base, letter))
    driver.get(url_base + letter)
    time.sleep(60)
New file: recipe-picker.py (12 lines added)
@@ -0,0 +1,12 @@
# Pick 5 recipe links at random from the scraped recipes.json index.
import json
import os
import random

pwd = os.getcwd()
print(pwd)

# recipes.json maps recipe name -> recipe-card URL (written by the scraper).
# os.path.join replaces the hard-coded "\\" so the path also works off Windows.
recipesdict = {}
with open(os.path.join(pwd, "recipes.json"), "r", encoding="utf8") as file:
    recipesdict = json.load(file)

recipes = list(recipesdict.keys())
for _ in range(5):
    # random.choice fixes the off-by-one of randint(0, len(recipes)):
    # randint's upper bound is inclusive, so recipes[len(recipes)] could
    # raise IndexError. Picks are independent, so repeats remain possible.
    print(recipesdict[random.choice(recipes)])
New file: recipes.json (2168 lines added)
File diff suppressed because it is too large — Load Diff
Reference in New Issue · Block a user