# eljur/scraper.py
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
from env_init import LOGIN, PASSWORD, DOMAIN
import logging
from os import mkdir
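
# env_init is not shown here; it is assumed to resolve the eljur credentials
# from the environment, roughly like the sketch below (the variable names
# ELJUR_LOGIN, ELJUR_PASSWORD, and ELJUR_DOMAIN are hypothetical):
#   LOGIN = os.environ["ELJUR_LOGIN"]
#   PASSWORD = os.environ["ELJUR_PASSWORD"]
#   DOMAIN = os.environ["ELJUR_DOMAIN"]  # school subdomain, e.g. "myschool"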
# Use sync version of Playwright
p = sync_playwright().start()
# Launch the browser
browser = p.chromium.launch()
logging.debug("browser opened")
# Open a new browser page
page = browser.new_page()
def scrape():
    global page
    try:
        # Open the login page
        page.goto(f"https://{DOMAIN}.eljur.ru/authorize?return_uri=%2Fjournal-board-action")
        try:
            # Log in
            login_field = page.locator('[type="text"]')
            password_field = page.locator('[type="password"]')
            login_field.fill(value=LOGIN)  # type: ignore
            password_field.fill(value=PASSWORD)  # type: ignore
            submit_button = page.locator('[type="submit"]')
            submit_button.click()
        except Exception as error:
            # The login form may be absent (e.g. a session is already active),
            # so navigate straight to the board page
            logging.debug(f"Error while logging in: {str(error)}")
            page.goto(f"https://{DOMAIN}.eljur.ru/journal-board-action?user={LOGIN}&domain={DOMAIN}")
        page.wait_for_url(f"https://{DOMAIN}.eljur.ru/journal-board-action?user={LOGIN}&domain={DOMAIN}")
        page.wait_for_load_state("domcontentloaded")
        if page.url != f"https://{DOMAIN}.eljur.ru/journal-board-action?user={LOGIN}&domain={DOMAIN}" and page.url != f"https://{DOMAIN}.eljur.ru/journal-board-action":
            page.goto(f"https://{DOMAIN}.eljur.ru/journal-board-action")
            page.wait_for_load_state("domcontentloaded")
        page_content = page.content()
        new_content = page.locator('[class="board-item active"]').first
        try:
            new_content.screenshot(path="data/screenshot.png")
        except FileNotFoundError:
            # The data/ directory does not exist yet: create it and retry once
            mkdir("data")
            logging.info("can't take screenshot")
            logging.info("retry")
            new_content.screenshot(path="data/screenshot.png")
        except Exception:
            logging.error("can't take screenshot")
        # Process extracted content with BeautifulSoup
        soup = BeautifulSoup(page_content, features="lxml")
        logging.debug("content extracted")
        # return '\n'.join(el.strip() for el in soup.get_text().split('\n') if el.strip())
        return soup.get_text()
    except PlaywrightTimeoutError:
        logging.error("connection timed out")
        return " "
    except Exception as error:
        logging.error(f"Error during request: {str(error)}")
        return " "