Kindle Scrape

import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def take_screenshot(driver, page_num):
    """Save a screenshot of the browser window and a crop of the page image.

    Waits up to 10 seconds for the element with class ``kg-full-page-img``
    to be present, writes the whole-window screenshot to
    ``screenshots/raw/page_{page_num}.png`` and the region covered by that
    element to ``screenshots/cropped/page_{page_num}.png``. Both output
    directories are created if they do not exist yet.

    Args:
        driver: Selenium WebDriver currently showing the page to capture.
        page_num: Value interpolated into both output file names.

    Raises:
        selenium.common.exceptions.TimeoutException: If the page image
            element does not appear within the 10 second wait.
        OSError: If the raw screenshot could not be written to disk.
    """
    # Local imports keep this block self-contained (the original already
    # imported PIL lazily inside the function).
    import os

    from PIL import Image

    raw_dir = 'screenshots/raw'
    cropped_dir = 'screenshots/cropped'
    screenshot_path = f'{raw_dir}/page_{page_num}.png'
    cropped_path = f'{cropped_dir}/page_{page_num}.png'

    image_locator = (By.CLASS_NAME, 'kg-full-page-img')  # Change to the appropriate locator for your image
    div_element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(image_locator))

    # Get the location and size of the div element
    div_location = div_element.location
    div_size = div_element.size

    # save_screenshot() reports I/O problems by returning False instead of
    # raising, so make sure the targets exist and check the result explicitly.
    os.makedirs(raw_dir, exist_ok=True)
    os.makedirs(cropped_dir, exist_ok=True)
    if not driver.save_screenshot(screenshot_path):
        raise OSError(f'Could not write screenshot to {screenshot_path}')

    # Element coordinates are reported in CSS pixels, whereas on high-DPI
    # displays the PNG can be rendered at a multiple of that size. Scale the
    # crop box by the device pixel ratio (1 on standard displays, where this
    # is a no-op).
    scale = driver.execute_script('return window.devicePixelRatio') or 1

    # Calculate the coordinates for cropping
    left = round(div_location['x'] * scale)
    top = round(div_location['y'] * scale)
    right = round((div_location['x'] + div_size['width']) * scale)
    bottom = round((div_location['y'] + div_size['height']) * scale)

    # Crop and save while the source file is still open; the context manager
    # guarantees the file handle is released afterwards.
    with Image.open(screenshot_path) as img:
        img.crop((left, top, right, bottom)).save(cropped_path)


def _wait_for_login(driver, timeout):
    """Poll once per second for the reader's page image; return False on timeout."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        time.sleep(1)
        try:
            driver.find_element(By.CLASS_NAME, 'kg-full-page-img')
        except NoSuchElementException:
            continue
        return True
    return False


def _read_page_label(driver):
    """Return the second space-separated token of the footer label, or None."""
    try:
        footer_text = driver.find_element(By.CLASS_NAME, 'footer-label').text
    except NoSuchElementException:
        return None
    tokens = footer_text.split(" ")
    # A label with fewer than two tokens (or an empty second token) is
    # treated the same as a missing label instead of raising IndexError.
    if len(tokens) > 1 and tokens[1]:
        return tokens[1]
    return None


def main(max_time_for_login=15):
    """Open the Kindle web reader and screenshot every page of the open book.

    Starts Chrome, loads ``https://read.amazon.com`` and waits until an
    element with class ``kg-full-page-img`` shows up, which is taken as the
    sign that login is complete and a book is open. It then repeatedly calls
    ``take_screenshot`` and clicks the ``kr-chevron-container-right`` element
    until that element can no longer be found.

    Args:
        max_time_for_login: Seconds to wait for the page image to appear
            before giving up. Pass a larger value if logging in takes longer.

    Raises:
        SystemExit: With exit status 1 if the login wait times out.
    """
    # Start Chrome with remote debugging enabled
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--remote-debugging-port=9222')
    # '--headless' is deliberately not added: this script never enters
    # credentials itself, so the login has to happen in a visible window.

    # Start the Chrome browser. The options were previously built but never
    # handed to the driver, so they had no effect.
    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Navigate to the website where files are located
        driver.get('https://read.amazon.com')

        if not _wait_for_login(driver, max_time_for_login):
            print("Login timed out")
            raise SystemExit(1)

        print("Login successful")
        time.sleep(10)  # Buffer time to configure things, move to 1st page if needed.

        pages_seen = 0  # Sequential counter used when the footer label is unreadable

        while True:
            # TODO Add random wait time here
            pages_seen += 1

            page_num = _read_page_label(driver)
            if page_num is None:
                print("Unable to get page num")
                # Fall back to the running count; the label token is a string,
                # so it must never be incremented directly.
                page_num = pages_seen

            take_screenshot(driver=driver, page_num=page_num)

            try:
                driver.find_element(By.CLASS_NAME, 'kr-chevron-container-right').click()
            except NoSuchElementException:  # Means there are no more pages
                break
            time.sleep(1)  # Wait 1 sec to imitate reading

        print("All pages copied as screenshots")
    finally:
        # Always shut the browser and driver process down, even on errors.
        driver.quit()


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

Previous: Django Database Introspection | Next: Add image over pdf

Last updated 1 year ago

import time from selenium import webdriver from selenium.common.exceptions import NoSuchElementException from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait def take_screenshot(driver, page_num): div_selector = 'kg-full-page-img' image_locator = (By.CLASS_NAME, div_selector) # Change to the appropriate locator for your image wait = WebDriverWait(driver, 10) div_element = wait.until(EC.presence_of_element_located(image_locator)) # Get the location and size of the div element div_location = div_element.location div_size = div_element.size # Take a screenshot of the entire page screenshot_path = f'screenshots/raw/page_{page_num}.png' driver.save_screenshot(screenshot_path) # Crop the screenshot to include only the specified div from PIL import Image # Calculate the coordinates for cropping left = div_location['x'] top = div_location['y'] right = left + div_size['width'] bottom = top + div_size['height'] # Open the screenshot and crop it img = Image.open(screenshot_path) img_cropped = img.crop((left, top, right, bottom)) # Save the cropped image img_cropped.save(f'screenshots/cropped/page_{page_num}.png') def main(): # Start Chrome with remote debugging enabled chrome_options = webdriver.ChromeOptions() chrome_options.add_argument('--remote-debugging-port=9222') chrome_options.add_argument( '--headless') # Optional: Run in headless mode if you don't need a visible browser window # Start the Chrome browser driver = webdriver.Chrome() # Navigate to the website where files are located driver.get('https://read.amazon.com') login_done = False max_time_for_login = 15 while not login_done: time.sleep(1) time_left = max_time_for_login - 1 if time_left <= 0: print("Login timed out") exit() try: div_selector = "kg-full-page-img" driver.find_element(By.CLASS_NAME, div_selector) login_done = True print("Login successful") time.sleep(10) # Buffer time to configure 
things, move to 1st page if incase needed. except NoSuchElementException: continue page_num = 0 while True: # TODO Add random wait time here # Find and click the div by its CSS selector # Find the element with the specified class try: element_class = 'footer-label' element = driver.find_element(By.CLASS_NAME, element_class) page_num = element.text.split(" ")[1] except NoSuchElementException: print("Unable to get page num") page_num += 1 take_screenshot(driver=driver, page_num=page_num) div_selector = 'kr-chevron-container-right' try: div_element = driver.find_element(By.CLASS_NAME, div_selector) div_element.click() time.sleep(1) # Wait 1 sec to imitate reading except NoSuchElementException: # Means there are no more pages break print("All pages copied as screenshots") if __name__ == '__main__': main()