Kindle Scrape

import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def take_screenshot(driver, page_num):
    """Save a full-window screenshot and a copy cropped to the page image.

    Waits up to 10 seconds for the element with class ``kg-full-page-img``
    to be present, writes the raw screenshot to
    ``screenshots/raw/page_<page_num>.png`` and the cropped version to
    ``screenshots/cropped/page_<page_num>.png``. Both output directories
    are created if they do not exist yet.

    Args:
        driver: Selenium WebDriver instance showing the page to capture.
        page_num: Value interpolated into the output file names.

    Raises:
        selenium.common.exceptions.TimeoutException: If the page image
            element does not appear within the wait period.
        OSError: If the raw screenshot could not be written to disk.
    """
    import os

    from PIL import Image

    div_selector = 'kg-full-page-img'
    image_locator = (By.CLASS_NAME, div_selector)  # Change to the appropriate locator for your image
    wait = WebDriverWait(driver, 10)
    div_element = wait.until(EC.presence_of_element_located(image_locator))

    # Get the location and size of the div element
    div_location = div_element.location
    div_size = div_element.size

    raw_path = f'screenshots/raw/page_{page_num}.png'
    cropped_path = f'screenshots/cropped/page_{page_num}.png'

    # Make sure both output directories exist before anything is written.
    for target in (raw_path, cropped_path):
        os.makedirs(os.path.dirname(target), exist_ok=True)

    # Take a screenshot of the entire page. save_screenshot reports a failed
    # write through its return value, so check it instead of failing later
    # with a less helpful error when the file is opened.
    if not driver.save_screenshot(raw_path):
        raise OSError(f'Could not write screenshot to {raw_path}')

    # Calculate the coordinates for cropping
    # NOTE(review): these are element coordinates as reported by the driver;
    # on displays with a device pixel ratio other than 1 the screenshot may
    # be scaled relative to them - confirm on the target machine.
    left = div_location['x']
    top = div_location['y']
    right = left + div_size['width']
    bottom = top + div_size['height']

    # Open the screenshot, crop it to the div and save the result. The
    # context manager closes the underlying file handle.
    with Image.open(raw_path) as img:
        img_cropped = img.crop((left, top, right, bottom))
        img_cropped.save(cropped_path)


def _wait_for_reader(driver, timeout_seconds):
    """Poll once per second until the reader's page image element exists.

    Returns True as soon as an element with class ``kg-full-page-img`` is
    found, or False once ``timeout_seconds`` have elapsed without finding it.
    """
    deadline = time.monotonic() + timeout_seconds
    while time.monotonic() < deadline:
        time.sleep(1)
        try:
            driver.find_element(By.CLASS_NAME, "kg-full-page-img")
        except NoSuchElementException:
            continue
        return True
    return False


def _read_page_label(driver):
    """Return the second space-separated token of the footer label, or None.

    None is returned when the ``footer-label`` element is missing or its
    text does not contain a non-empty second token.
    """
    try:
        footer_text = driver.find_element(By.CLASS_NAME, 'footer-label').text
    except NoSuchElementException:
        return None

    tokens = footer_text.split(" ")
    if len(tokens) < 2 or not tokens[1]:
        return None
    return tokens[1]


def main(max_time_for_login=300, headless=False):
    """Open the Kindle web reader and screenshot every page of the open book.

    Starts Chrome, navigates to https://read.amazon.com and waits for the
    reader's page image to appear (which requires the user to log in and
    open a book). It then repeatedly captures the current page with
    ``take_screenshot`` and clicks the right chevron until no chevron
    element is found. The browser is always closed on the way out.

    Args:
        max_time_for_login: Seconds to wait for the reader page to appear
            before giving up. The previous hard-coded limit of 15 seconds
            was never enforced; a larger default is used now that it is,
            because the wait covers a manual login.
        headless: Add Chrome's ``--headless`` flag. Off by default because
            the login step needs a visible browser window.

    Raises:
        SystemExit: With status 1 when the reader page does not appear
            within ``max_time_for_login`` seconds.
    """
    # Start Chrome with remote debugging enabled
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--remote-debugging-port=9222')
    if headless:
        chrome_options.add_argument('--headless')

    # Start the Chrome browser; the options must be passed explicitly or
    # they have no effect.
    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Navigate to the website where files are located
        driver.get('https://read.amazon.com')

        if not _wait_for_reader(driver, max_time_for_login):
            print("Login timed out")
            raise SystemExit(1)

        print("Login successful")
        time.sleep(10)  # Buffer time to configure things, move to 1st page if incase needed.

        # Integer fallback counter, kept separate from the (string) footer
        # label so that incrementing it can never fail on a str.
        page_num = 0

        while True:
            # TODO Add random wait time here
            page_label = _read_page_label(driver)
            if page_label is None:
                print("Unable to get page num")
                page_num += 1
                page_label = page_num
            else:
                # Keep the counter in step with the last numeric label so a
                # later fallback continues from it.
                try:
                    page_num = int(page_label)
                except ValueError:
                    pass

            take_screenshot(driver=driver, page_num=page_label)

            try:
                next_button = driver.find_element(By.CLASS_NAME, 'kr-chevron-container-right')
            except NoSuchElementException:  # Means there are no more pages
                break
            next_button.click()
            time.sleep(1)  # Wait 1 sec to imitate reading

        print("All pages copied as screenshots")
    finally:
        # Always shut the browser down, including on timeout or error.
        driver.quit()

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

Last updated