Kindle Scrape
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def take_screenshot(driver, page_num, *, output_dir='screenshots', timeout=10):
    """Save a screenshot of the browser window and a crop of the page image.

    Waits for the element with class ``kg-full-page-img`` to be present,
    writes the whole-window screenshot to ``<output_dir>/raw/page_<page_num>.png``
    and the region covered by that element to
    ``<output_dir>/cropped/page_<page_num>.png``.

    Args:
        driver: Selenium WebDriver showing the page to capture.
        page_num: Label used in both output file names.
        output_dir: Directory that receives the ``raw`` and ``cropped``
            sub-directories; they are created when missing.
        timeout: Seconds to wait for the page image element to appear.

    Raises:
        selenium.common.exceptions.TimeoutException: The page image element
            did not appear within ``timeout`` seconds.
        OSError: The raw screenshot could not be written.
    """
    # Function-scope imports keep the block self-contained (Pillow was already
    # imported here in the original code).
    from pathlib import Path

    from PIL import Image

    image_locator = (By.CLASS_NAME, 'kg-full-page-img')
    div_element = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located(image_locator)
    )

    # Position and size of the page image inside the viewport.
    div_location = div_element.location
    div_size = div_element.size

    file_name = f'page_{page_num}.png'
    raw_path = Path(output_dir) / 'raw' / file_name
    cropped_path = Path(output_dir) / 'cropped' / file_name
    # save_screenshot() does not create missing directories; it just reports
    # failure, so make sure both targets exist before writing.
    for target in (raw_path, cropped_path):
        target.parent.mkdir(parents=True, exist_ok=True)

    # A False return means the file was not written; without this check a
    # stale file from an earlier run could be cropped instead.
    if not driver.save_screenshot(str(raw_path)):
        raise OSError(f'Could not write screenshot to {raw_path}')

    # Element geometry is reported in CSS pixels while the screenshot is in
    # device pixels; on HiDPI displays the two differ by devicePixelRatio.
    scale = driver.execute_script('return window.devicePixelRatio;') or 1

    left = round(div_location['x'] * scale)
    top = round(div_location['y'] * scale)
    right = round((div_location['x'] + div_size['width']) * scale)
    bottom = round((div_location['y'] + div_size['height']) * scale)

    # The context manager closes the underlying file handle once the crop is
    # saved.
    with Image.open(raw_path) as img:
        img.crop((left, top, right, bottom)).save(cropped_path)
def _page_label(footer_text):
    """Return the second whitespace-separated token of the footer, or None."""
    tokens = footer_text.split()
    return tokens[1] if len(tokens) > 1 else None


def _fallback_page_num(previous, pages_captured):
    """Pick a page label when the footer cannot be read.

    Continues from the previous label when it is numeric; otherwise falls
    back to the count of screenshots taken so far plus one.
    """
    try:
        return int(previous) + 1
    except (TypeError, ValueError):
        return pages_captured + 1


def main(max_time_for_login=300):
    """Open the Kindle web reader, wait for login, and screenshot every page.

    The browser is started with a visible window so the user can sign in and
    open a book. Once the page image (class ``kg-full-page-img``) is found,
    each page is captured with ``take_screenshot`` and the right-hand chevron
    is clicked until it can no longer be found.

    Args:
        max_time_for_login: Seconds to wait for the page image to appear
            before giving up. The original 15 second limit was never
            enforced; a manual sign-in needs far longer, hence the larger
            default.

    Raises:
        SystemExit: With status 1 when the login wait times out.
    """
    # Start Chrome with remote debugging enabled.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--remote-debugging-port=9222')
    # '--headless' is deliberately not set: login is done by hand and needs a
    # visible browser window.

    # The options must be passed explicitly, otherwise they have no effect.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Navigate to the website where files are located
        driver.get('https://read.amazon.com')

        # Poll once per second until the reader's page image shows up.
        deadline = time.monotonic() + max_time_for_login
        while True:
            try:
                driver.find_element(By.CLASS_NAME, 'kg-full-page-img')
            except NoSuchElementException:
                if time.monotonic() >= deadline:
                    print("Login timed out")
                    raise SystemExit(1)
                time.sleep(1)
            else:
                break
        print("Login successful")
        time.sleep(10)  # Buffer time to configure things, move to 1st page if needed.

        page_num = 0
        pages_captured = 0
        while True:
            # TODO Add random wait time here
            # Read the page label from the footer; it stays a string so file
            # names match what the footer shows.
            label = None
            try:
                footer = driver.find_element(By.CLASS_NAME, 'footer-label')
                label = _page_label(footer.text)
            except NoSuchElementException:
                pass
            if label is None:
                print("Unable to get page num")
                # page_num may hold a string from an earlier page, so a plain
                # ``+= 1`` would raise TypeError here.
                label = _fallback_page_num(page_num, pages_captured)
            page_num = label

            take_screenshot(driver=driver, page_num=page_num)
            pages_captured += 1

            try:
                next_button = driver.find_element(
                    By.CLASS_NAME, 'kr-chevron-container-right'
                )
                next_button.click()
            except NoSuchElementException:  # Means there are no more pages
                break
            time.sleep(1)  # Wait 1 sec to imitate reading

        print("All pages copied as screenshots")
    finally:
        # Always shut the browser and driver process down, also on timeout or
        # on an unexpected error.
        driver.quit()
# Run the scraper only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
Last updated