70 lines
2.5 KiB
Python
70 lines
2.5 KiB
Python
|
from bs4 import BeautifulSoup
|
||
|
from playwright.sync_api import sync_playwright
|
||
|
from env_init import LOGIN, PASSWORD, DOMAIN
|
||
|
import logging
|
||
|
from playwright._impl import _api_types
|
||
|
from os import mkdir
|
||
|
|
||
|
|
||
|
# Start Playwright through its synchronous API.
p = sync_playwright().start()

# Launch Chromium with the default launch options.
browser = p.chromium.launch()
logging.debug("browser opened")

# One shared page object; scrape() below drives this page.
page = browser.new_page()
|
||
|
|
||
|
|
||
|
def scrape():
    """Log in to the eljur.ru journal board and return the page text.

    Drives the module-level Playwright ``page``: opens the authorization
    page for ``DOMAIN``, submits ``LOGIN`` / ``PASSWORD``, waits for the
    journal board to load, saves a screenshot of the first
    ``board-item active`` element to ``data/screenshot.png`` (best effort)
    and extracts the text of the page with BeautifulSoup.

    Returns:
        str: The text content of the board page, or ``" "`` when the
        request timed out or any other error occurred (the error is logged).
    """
    base_url = f"https://{DOMAIN}.eljur.ru"
    board_url = f"{base_url}/journal-board-action"
    user_board_url = f"{board_url}?user={LOGIN}&domain={DOMAIN}"

    try:
        page.goto(f"{base_url}/authorize?return_uri=%2Fjournal-board-action")
        try:
            # Fill in and submit the login form.
            page.locator('[type="text"]').fill(value=LOGIN)  # type: ignore
            page.locator('[type="password"]').fill(value=PASSWORD)  # type: ignore
            page.locator('[type="submit"]').click()
        except Exception as error:
            # The form could not be filled in or submitted; try to open the
            # board directly instead.
            logging.debug("Error while request: %s", error)
            page.goto(user_board_url)

        page.wait_for_url(user_board_url)
        page.wait_for_load_state("domcontentloaded")
        if page.url not in (user_board_url, board_url):
            page.goto(board_url)
            page.wait_for_load_state("domcontentloaded")

        page_content = page.content()

        # The screenshot is best effort: no failure here (including a failed
        # retry) may discard the page content that was already fetched.
        new_content = page.locator('[class="board-item active"]').first
        try:
            try:
                new_content.screenshot(path="data/screenshot.png")
            except FileNotFoundError:
                # The target directory is missing: create it and retry once.
                mkdir("data")
                logging.info("can`t take screenshot")
                logging.info("retry")
                new_content.screenshot(path="data/screenshot.png")
        except Exception:
            logging.error("can`t take screenshot")

        # Process extracted content with BeautifulSoup.
        soup = BeautifulSoup(page_content, features="lxml")
        logging.debug("content extracted")

        # get_text must be *called*; str(soup.get_text) would return the
        # repr of the bound method rather than the page text.
        return soup.get_text()

    # NOTE(review): _api_types is a private Playwright module; the public
    # playwright.sync_api.TimeoutError looks like the supported spelling --
    # confirm against the installed Playwright version.
    except _api_types.TimeoutError:
        logging.error("connection timed out")
        return " "
    except Exception as error:
        logging.error("Error while request: %s", error)
        return " "
|