updated scraper and added picker
This commit is contained in:
88
hello-fresh-scraper.py
Normal file
88
hello-fresh-scraper.py
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
from string import ascii_lowercase
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
from selenium.webdriver.common import keys
|
||||||
|
from selenium.webdriver.common.keys import Keys
|
||||||
|
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import urllib.request
|
||||||
|
# Base of the alphabetical recipe sitemap; a page suffix is appended per request.
url_base = "https://www.hellofresh.com/pages/sitemap/recipes-"

# Chrome download preferences. They only matter if the driver below is
# switched to Chrome; the script currently drives Firefox.
chrome_options = Options()
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": "E:/Nico/Documents/GitHub/hello-fresh-scraper/",
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing_for_trusted_sources_enabled": False,
    "safebrowsing.enabled": False,
    "plugins.always_open_pdf_externally": True,
})
# "excludeSwitches" is a ChromeDriver option of its own, not a browser
# preference, so it has to be set outside the "prefs" dict to have any effect.
chrome_options.add_experimental_option("excludeSwitches", ["disable-popup-blocking"])

# To run against Chrome instead:
#driver = webdriver.Chrome(chrome_options=chrome_options)
driver = webdriver.Firefox()
|
||||||
|
# function to take care of downloading file
|
||||||
|
def enable_download_headless(browser,download_dir):
    """Send Page.setDownloadBehavior to a Chromium session to allow downloads.

    Registers a "send_command" entry on the browser's command executor that
    POSTs to the chromium/send_command endpoint, then executes it with
    behavior "allow" and ``download_dir`` as the download path.

    Args:
        browser: driver object exposing ``command_executor._commands`` and
            ``execute`` (presumably a Chrome webdriver -- confirm at call site).
        download_dir: path passed as the ``downloadPath`` parameter.
    """
    endpoint = ("POST", '/session/$sessionId/chromium/send_command')
    browser.command_executor._commands["send_command"] = endpoint

    browser.execute(
        "send_command",
        {
            'cmd': 'Page.setDownloadBehavior',
            'params': {'behavior': 'allow', 'downloadPath': download_dir},
        },
    )
|
||||||
|
# Imported here so this section is self-contained. By supplies the locator
# strategies for find_element(s); the find_element(s)_by_* helpers used
# previously were removed in Selenium 4.3.
from selenium.webdriver.common.by import By

download_dir = "E:/Nico/Documents/GitHub/hello-fresh-scraper/"
#enable_download_headless(driver, download_dir)

# Start from the previously saved name -> URL map so entries from earlier
# runs are kept and only new/changed ones are overwritten.
recipes = {}
with open("E:/Nico/Documents/HelloFresh/recipes.json", "r", encoding='utf8') as file:
    recipes = json.load(file)

# try/finally guarantees the browser is shut down even when a page is missing
# an expected element and the scrape aborts part-way.
try:
    for x in ["s2"]:  # ascii_lowercase:
        print("Going to: {}{}".format(url_base, x))
        driver.get(url_base + x)

        # The recipe table lives in the second element carrying this class.
        topdiv = driver.find_elements(By.CLASS_NAME, "css-6ht6q2")[1]

        for row in topdiv.find_elements(By.TAG_NAME, "tr"):
            col = row.find_element(By.TAG_NAME, "td")

            # Key is the cell text with its last word dropped and the
            # remaining words joined by underscores.
            destFile = "_".join(col.text.split(" ")[:-1])
            print(destFile)

            link = col.find_element(By.TAG_NAME, "a")
            recipes[destFile] = link.get_attribute('href')

            # NOTE: a disabled per-recipe PDF download step (open the recipe
            # in a new tab, read its PDF link, fetch it with urllib) used to
            # sit here as a dead string literal; see version control.
finally:
    driver.quit()

#print(recipes)
with open("E:/Nico/Documents/HelloFresh/recipes.json", "w", encoding='utf8') as file:
    json.dump(recipes, file, indent=4, ensure_ascii=False, sort_keys=True)
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
from string import ascii_lowercase
|
|
||||||
from selenium import webdriver
|
|
||||||
from selenium.webdriver.common.keys import Keys
|
|
||||||
import time
|
|
||||||
# Base of the alphabetical recipe sitemap; a page suffix is appended per request.
url_base = "https://www.hellofresh.com/pages/sitemap/recipes-"

driver = webdriver.Chrome()
# try/finally so the Chrome session is closed when the script ends or fails,
# instead of leaving the browser process behind.
try:
    for x in ['a']:  # ascii_lowercase:
        print("Going to: {}{}".format(url_base, x))
        driver.get(url_base + x)
        # Keep the page open for a minute.
        time.sleep(60)
finally:
    driver.quit()
|
|
||||||
12
recipe-picker.py
Normal file
12
recipe-picker.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
# Print five randomly chosen recipe URLs from recipes.json in the current
# working directory.
pwd = os.getcwd()
print(pwd)

recipesdict = {}
# os.path.join rather than a hard-coded "\\" so the script is not tied to
# Windows path separators.
with open(os.path.join(pwd, "recipes.json"), "r", encoding="utf8") as file:
    recipesdict = json.load(file)

recipes = list(recipesdict.keys())
if recipes:
    for x in range(5):
        # random.choice always yields a valid key. The previous
        # randint(0, len(recipes)) includes its upper bound and could index
        # one past the end of the list, raising IndexError.
        print(recipesdict[random.choice(recipes)])
else:
    print("No recipes found in recipes.json")
|
||||||
2168
recipes.json
Normal file
2168
recipes.json
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user