# Kindle Scrape
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def take_screenshot(driver, page_num):
    """Save a full screenshot of the current page and a crop of the page image.

    Waits up to 10 seconds for the element with class ``kg-full-page-img`` to
    be present, writes the whole browser screenshot to
    ``screenshots/raw/page_<page_num>.png`` and the region covered by that
    element to ``screenshots/cropped/page_<page_num>.png``.

    Args:
        driver: Selenium WebDriver currently showing the page to capture.
        page_num: Value used in the output file names.

    Raises:
        selenium.common.exceptions.TimeoutException: If the page image element
            does not appear within 10 seconds.
        OSError: If the raw screenshot could not be written.
    """
    import os

    from PIL import Image

    raw_dir = 'screenshots/raw'
    cropped_dir = 'screenshots/cropped'
    # The output folders are not guaranteed to exist on a fresh checkout;
    # without them save_screenshot() fails and the crop step has no input.
    os.makedirs(raw_dir, exist_ok=True)
    os.makedirs(cropped_dir, exist_ok=True)

    image_locator = (By.CLASS_NAME, 'kg-full-page-img')  # Change to the appropriate locator for your image
    div_element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(image_locator))

    # Location and size are reported in CSS pixels.
    div_location = div_element.location
    div_size = div_element.size

    # Take a screenshot of the entire page.
    screenshot_path = f'{raw_dir}/page_{page_num}.png'
    if not driver.save_screenshot(screenshot_path):
        raise OSError(f'Could not write screenshot to {screenshot_path}')

    # The screenshot is rendered in device pixels, so on scaled / HiDPI
    # displays the CSS-pixel box has to be multiplied by devicePixelRatio
    # before cropping.
    scale = driver.execute_script('return window.devicePixelRatio') or 1
    left = div_location['x'] * scale
    top = div_location['y'] * scale
    right = left + div_size['width'] * scale
    bottom = top + div_size['height'] * scale
    crop_box = tuple(int(round(edge)) for edge in (left, top, right, bottom))

    # Crop while the source image is open so the file handle is released
    # as soon as the cropped copy has been saved.
    with Image.open(screenshot_path) as img:
        img.crop(crop_box).save(f'{cropped_dir}/page_{page_num}.png')
def main(max_time_for_login: int = 15) -> None:
    """Open the Kindle web reader and screenshot every page of the open book.

    Starts Chrome, loads https://read.amazon.com and polls once per second
    until an element with class ``kg-full-page-img`` is found, which is taken
    to mean the user has logged in and opened a book. It then repeatedly
    screenshots the page and clicks the ``kr-chevron-container-right``
    element until that element can no longer be found.

    Args:
        max_time_for_login: Number of one-second polls to wait for the reader
            to appear before giving up. NOTE(review): 15 is the original
            value; it is likely too short for a manual login and may need
            raising.

    Raises:
        SystemExit: With exit status 1 if the reader does not appear in time.
    """
    # Start Chrome with remote debugging enabled.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--remote-debugging-port=9222')
    # '--headless' is intentionally not added: the login below is done by
    # hand, which needs a visible browser window.

    # The options must be handed to the driver, otherwise they are ignored.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Navigate to the website where files are located.
        driver.get('https://read.amazon.com')

        # Poll for the page image; the for/else gives a real countdown so
        # the loop cannot spin forever when nobody logs in.
        for _ in range(max_time_for_login):
            time.sleep(1)
            try:
                driver.find_element(By.CLASS_NAME, 'kg-full-page-img')
            except NoSuchElementException:
                continue
            print("Login successful")
            time.sleep(10)  # Buffer time to configure things, move to 1st page if incase needed.
            break
        else:
            print("Login timed out")
            raise SystemExit(1)

        page_num = 0
        while True:
            # TODO Add random wait time here
            try:
                footer = driver.find_element(By.CLASS_NAME, 'footer-label')
                # Assumes the second whitespace-separated token of the footer
                # text is the page number - TODO confirm against the reader UI.
                # Converting to int keeps the "+= 1" fallback below valid.
                page_num = int(footer.text.split()[1])
            except (NoSuchElementException, IndexError, ValueError):
                print("Unable to get page num")
                page_num += 1

            take_screenshot(driver=driver, page_num=page_num)

            try:
                driver.find_element(By.CLASS_NAME, 'kr-chevron-container-right').click()
            except NoSuchElementException:  # Means there are no more pages
                break
            time.sleep(1)  # Wait 1 sec to imitate reading

        print("All pages copied as screenshots")
    finally:
        # Always shut the browser and driver process down, including on
        # timeout or an unexpected error.
        driver.quit()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
# Last updated