import logging
from os import mkdir

from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
# NOTE(review): `_impl` is a private Playwright package and may move between
# releases; `playwright.sync_api` also exports `TimeoutError` publicly.
# Consider switching once nothing else in the project needs `_api_types`.
from playwright._impl import _api_types

from env_init import LOGIN, PASSWORD, DOMAIN

# URLs used by scrape(); built once from the configured school domain/login.
_LOGIN_URL = (
    f"https://{DOMAIN}.eljur.ru/authorize?return_uri=%2Fjournal-board-action"
)
_BOARD_URL = f"https://{DOMAIN}.eljur.ru/journal-board-action"
_BOARD_URL_WITH_USER = f"{_BOARD_URL}?user={LOGIN}&domain={DOMAIN}"

# Where the screenshot of the active board item is written.
_SCREENSHOT_DIR = "data"
_SCREENSHOT_PATH = "data/screenshot.png"

# Use sync version of Playwright.
# NOTE: this runs at import time and nothing in this module stops Playwright
# or closes the browser again.
p = sync_playwright().start()
# Launch the browser
browser = p.chromium.launch()
logging.debug("browser opened")
# Open a new browser page; scrape() reuses this single page on every call.
page = browser.new_page()


def _save_screenshot(target) -> None:
    """Save a screenshot of *target* to ``data/screenshot.png``, best effort.

    Any failure (directory cannot be created, element never appears, write
    error) is logged and swallowed so that a missing screenshot never
    prevents scrape() from returning the page text.
    """
    try:
        try:
            mkdir(_SCREENSHOT_DIR)
        except FileExistsError:
            pass  # directory is already there
        target.screenshot(path=_SCREENSHOT_PATH)
    except Exception:
        logging.exception("can`t take screenshot")


def scrape() -> str:
    """Log in, open the journal board and return the page's text.

    Side effect: tries to store a screenshot of the first
    ``board-item active`` element in ``data/screenshot.png``.

    Returns:
        The text extracted from the board page by BeautifulSoup, or the
        single-space string ``" "`` when Playwright times out or any other
        error occurs (the error is logged, never raised).
    """
    try:
        # Open the login page; it redirects to the board after authorization.
        page.goto(_LOGIN_URL)
        try:
            # Log in
            page.locator('[type="text"]').fill(value=LOGIN)  # type: ignore
            page.locator('[type="password"]').fill(value=PASSWORD)  # type: ignore
            page.locator('[type="submit"]').click()
        except Exception as error:
            # The login form could not be filled in or submitted; fall back
            # to opening the board page directly.
            logging.debug("Error while request: %s", error)
            page.goto(_BOARD_URL_WITH_USER)

        page.wait_for_url(_BOARD_URL_WITH_USER)
        page.wait_for_load_state("domcontentloaded")
        if page.url not in (_BOARD_URL_WITH_USER, _BOARD_URL):
            page.goto(_BOARD_URL)
            page.wait_for_load_state("domcontentloaded")

        # Grab the HTML before the (possibly slow) screenshot attempt.
        page_content = page.content()
        _save_screenshot(page.locator('[class="board-item active"]').first)

        # Process extracted content with BeautifulSoup
        soup = BeautifulSoup(page_content, features="lxml")
        logging.debug("content extracted")
        # get_text must be *called*; str(soup.get_text) would only give the
        # repr of the bound method.
        return soup.get_text()
    except _api_types.TimeoutError:
        logging.error("connection timed out")
        return " "
    except Exception as error:
        logging.error("Error while request: %s", error)
        return " "