-
-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
enhancement: add support for Playwright's
storage_state
parameter (#…
…832) * add support for Playwright `storage_state` * add storage_state param to node_config * add sleep for testing * add sleep in _with_js_support for testing * remove asyncio.sleep() from tests * fix typo in existing example filename; add auth example * add example `authenticated_playwright` * update source link in example to /feed * add `storage_state` to missing graphs
- Loading branch information
1 parent
fbb4252
commit a86e7d6
Showing
16 changed files
with
585 additions
and
329 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
""" | ||
Example leveraging a state file containing session cookies which | ||
might be leveraged to authenticate to a website and scrape protected | ||
content. | ||
""" | ||
|
||
import os | ||
import random | ||
from dotenv import load_dotenv | ||
|
||
# import playwright so we can use it to create the state file | ||
from playwright.async_api import async_playwright | ||
|
||
from scrapegraphai.graphs import OmniScraperGraph | ||
from scrapegraphai.utils import prettify_exec_info | ||
|
||
load_dotenv() | ||
|
||
# ************************************************ | ||
# Leveraging Playwright external to the invocation of the graph to | ||
# login and create the state file | ||
# ************************************************ | ||
|
||
|
||
# note this is just an example and probably won't actually work on | ||
# LinkedIn, the implementation of the login is highly dependent on the website | ||
async def do_login(): | ||
async with async_playwright() as playwright: | ||
browser = await playwright.chromium.launch( | ||
timeout=30000, | ||
headless=False, | ||
slow_mo=random.uniform(500, 1500), | ||
) | ||
page = await browser.new_page() | ||
|
||
# very basic implementation of a login, in reality it may be trickier | ||
await page.goto("https://www.linkedin.com/login") | ||
await page.get_by_label("Email or phone").fill("some_bloke@some_domain.com") | ||
await page.get_by_label("Password").fill("test1234") | ||
await page.get_by_role("button", name="Sign in").click() | ||
await page.wait_for_timeout(3000) | ||
|
||
# assuming a successful login, we save the cookies to a file | ||
await page.context.storage_state(path="./state.json") | ||
|
||
|
||
async def main(): | ||
await do_login() | ||
|
||
# ************************************************ | ||
# Define the configuration for the graph | ||
# ************************************************ | ||
|
||
openai_api_key = os.getenv("OPENAI_APIKEY") | ||
|
||
graph_config = { | ||
"llm": { | ||
"api_key": openai_api_key, | ||
"model": "openai/gpt-4o", | ||
}, | ||
"max_images": 10, | ||
"headless": False, | ||
# provide the path to the state file | ||
"storage_state": "./state.json", | ||
} | ||
|
||
# ************************************************ | ||
# Create the OmniScraperGraph instance and run it | ||
# ************************************************ | ||
|
||
omni_scraper_graph = OmniScraperGraph( | ||
prompt="List me all the projects with their description.", | ||
source="https://www.linkedin.com/feed/", | ||
config=graph_config, | ||
) | ||
|
||
# the storage_state is used to load the cookies from the state file | ||
# so we are authenticated and able to scrape protected content | ||
result = omni_scraper_graph.run() | ||
print(result) | ||
|
||
# ************************************************ | ||
# Get graph execution info | ||
# ************************************************ | ||
|
||
graph_exec_info = omni_scraper_graph.get_execution_info() | ||
print(prettify_exec_info(graph_exec_info)) | ||
|
||
|
||
if __name__ == "__main__": | ||
import asyncio | ||
|
||
asyncio.run(main()) |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.