# eljur/scraper.py
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
from env_init import LOGIN, PASSWORD, DOMAIN
import logging
from os import mkdir
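
# env_init is not shown here; it is assumed to resolve the eljur credentials
# from the environment, roughly like the sketch below (the variable names
# ELJUR_LOGIN, ELJUR_PASSWORD, and ELJUR_DOMAIN are hypothetical):
#   LOGIN = os.environ["ELJUR_LOGIN"]
#   PASSWORD = os.environ["ELJUR_PASSWORD"]
#   DOMAIN = os.environ["ELJUR_DOMAIN"]  # school subdomain, e.g. "myschool"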
# Use sync version of Playwright
p = sync_playwright().start()
# Launch the browser
browser = p.chromium.launch()
logging.debug("browser opened")
# Open a new browser page
page = browser.new_page()
def scrape():
    global page
    try:
        # Open the login page
        page.goto(f"https://{DOMAIN}.eljur.ru/authorize?return_uri=%2Fjournal-board-action")
        try:
            # Log in
            login_field = page.locator('[type="text"]')
            password_field = page.locator('[type="password"]')
            login_field.fill(value=LOGIN)  # type: ignore
            password_field.fill(value=PASSWORD)  # type: ignore
            submit_button = page.locator('[type="submit"]')
            submit_button.click()
        except Exception as error:
            # The login form may be absent (e.g. a session is already active),
            # so navigate straight to the board page
            logging.debug(f"Error while logging in: {str(error)}")
            page.goto(f"https://{DOMAIN}.eljur.ru/journal-board-action?user={LOGIN}&domain={DOMAIN}")
        page.wait_for_url(f"https://{DOMAIN}.eljur.ru/journal-board-action?user={LOGIN}&domain={DOMAIN}")
        page.wait_for_load_state("domcontentloaded")
        if page.url != f"https://{DOMAIN}.eljur.ru/journal-board-action?user={LOGIN}&domain={DOMAIN}" and page.url != f"https://{DOMAIN}.eljur.ru/journal-board-action":
            page.goto(f"https://{DOMAIN}.eljur.ru/journal-board-action")
            page.wait_for_load_state("domcontentloaded")
        page_content = page.content()
        new_content = page.locator('[class="board-item active"]').first
        try:
            new_content.screenshot(path="data/screenshot.png")
        except FileNotFoundError:
            # The data/ directory does not exist yet: create it and retry once
            mkdir("data")
            logging.info("can't take screenshot")
            logging.info("retry")
            new_content.screenshot(path="data/screenshot.png")
        except Exception:
            logging.error("can't take screenshot")
        # Process extracted content with BeautifulSoup
        soup = BeautifulSoup(page_content, features="lxml")
        logging.debug("content extracted")
        # return '\n'.join(el.strip() for el in soup.get_text().split('\n') if el.strip())
        return soup.get_text()
    except PlaywrightTimeoutError:
        logging.error("connection timed out")
        return " "
    except Exception as error:
        logging.error(f"Error during request: {str(error)}")
        return " "