from string import ascii_lowercase
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import time
import urllib.request

# Base URL of the HelloFresh recipe sitemap; a page segment is appended per iteration.
url_base = "https://www.hellofresh.com/pages/sitemap/recipes-"

# Chrome profile preferences so PDFs download silently instead of opening in the viewer.
chrome_options = Options()
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": "E:/Nico/Documents/GitHub/hello-fresh-scraper/",
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing_for_trusted_sources_enabled": False,
    "safebrowsing.enabled": False,
    "plugins.always_open_pdf_externally": True,
    "excludeSwitches": ["disable-popup-blocking"]
})
#chrome_options.set_capability("excludedSwitches", "disable-popup-blocking")
#driver = webdriver.Chrome(chrome_options=chrome_options)
driver = webdriver.Firefox()

# Allow file downloads while Chrome runs headless (DevTools Page.setDownloadBehavior).
def enable_download_headless(browser, download_dir):
    browser.command_executor._commands["send_command"] = (
        "POST", '/session/$sessionId/chromium/send_command')
    params = {'cmd': 'Page.setDownloadBehavior',
              'params': {'behavior': 'allow', 'downloadPath': download_dir}}
    browser.execute("send_command", params)

download_dir = "E:/Nico/Documents/GitHub/hello-fresh-scraper/"
#enable_download_headless(driver, download_dir)

# Load previously scraped recipe links so reruns only add new entries.
recipes = {}
with open("E:/Nico/Documents/HelloFresh/recipes.json", "r", encoding='utf8') as file:
    recipes = json.load(file)

for x in ["s2"]:  # ascii_lowercase:
    print("Going to: {}{}".format(url_base, x))
    driver.get(url_base + x)

    # Setup wait for later
    wait = WebDriverWait(driver, 10)
    # Store the ID of the original window
    original_window = driver.current_window_handle
    # Check we don't have other windows open already
    assert len(driver.window_handles) == 1

    # The second element with this class holds the table of recipe links.
    topdiv = driver.find_elements_by_class_name("css-6ht6q2")[1]
    rows = topdiv.find_elements_by_tag_name("tr")
    for row in rows:
        col = row.find_element_by_tag_name("td")
        # Build a filename from the recipe title: drop the trailing word and
        # join the rest with underscores.
        destFile = col.text.split(" ")
        destFile = "_".join(destFile[:-1])
        print(destFile)
        link = col.find_element_by_tag_name("a")
        recipes[destFile] = link.get_attribute('href')

        #link.send_keys(Keys.CONTROL, Keys.RETURN)
        #wait = WebDriverWait(driver, 10)

        # Disabled: open each recipe in a new tab, grab the PDF link,
        # download it, then return to the sitemap tab.
        """page = driver.window_handles
        for w in page:
            if (w != original_window):
                driver.switch_to.window(w)
                #wait = WebDriverWait(driver, 10)
                break

        recipediv = driver.find_element_by_class_name("fela-_1s0c67o")
        pdflink = recipediv.find_element_by_tag_name("a")
        url = pdflink.get_attribute('href')
        print(pdflink.get_attribute('href'))
        urllib.request.urlretrieve(url, "E:/Nico/Documents/HelloFresh/" + destFile + ".pdf")

        #time.sleep(3)
        print("going back")
        driver.close()
        driver.switch_to.window(original_window)

        try:
            driver.find_element_by_link_text("decline offer").click()
            print("-----------CLICKED DECLINE OFFER---------------")
        except:
            pass"""

    #time.sleep(10)

driver.quit()

# Persist the collected recipe name -> URL map.
#print(recipes)
with open("E:/Nico/Documents/HelloFresh/recipes.json", "w", encoding='utf8') as file:
    json.dump(recipes, file, indent=4, ensure_ascii=False, sort_keys=True)
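
# --- Hedged sketch (defined only, never called): one way the disabled
# PDF-download path above could be run with headless Chrome instead of Firefox,
# reusing chrome_options and enable_download_headless from this script.
# The function name, its parameters, and the assumption that the
# "fela-_1s0c67o" class and the local download directory still match the live
# site are illustrative only, not part of the original workflow.
def download_recipe_pdfs(recipes, download_dir="E:/Nico/Documents/HelloFresh/"):
    # Each recipe page is fetched directly from its stored URL, so no extra
    # browser tabs or window switching are needed; the PDF is then saved with
    # urllib, mirroring the urlretrieve call in the disabled block above.
    chrome_options.add_argument("--headless")
    browser = webdriver.Chrome(options=chrome_options)
    enable_download_headless(browser, download_dir)
    try:
        for name, href in recipes.items():
            browser.get(href)
            # Same recipe-card class as in the disabled block; may have changed.
            recipediv = browser.find_element_by_class_name("fela-_1s0c67o")
            pdf_url = recipediv.find_element_by_tag_name("a").get_attribute('href')
            urllib.request.urlretrieve(pdf_url, download_dir + name + ".pdf")
    finally:
        browser.quit()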