diff --git a/examples/scrapegraph-api/smart_scraper_api.py b/examples/scrapegraph-api/smart_scraper_api.py new file mode 100644 index 00000000..8a292ee9 --- /dev/null +++ b/examples/scrapegraph-api/smart_scraper_api.py @@ -0,0 +1,44 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + + +graph_config = { + "llm": { + "model": "scrapegraphai/smart-scraper", + "api_key": os.getenv("SCRAPEGRAPH_API_KEY") + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="Extract me all the articles", + source="https://www.wired.com", + config=graph_config +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/pyproject.toml b/pyproject.toml index 20ed05f3..54ee76b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,8 @@ dependencies = [ "transformers>=4.44.2", "googlesearch-python>=1.2.5", "simpleeval>=1.0.0", - "async_timeout>=4.0.3" + "async_timeout>=4.0.3", + "scrapegraph-py>=0.0.4" ] license = "MIT" diff --git a/requirements-dev.lock b/requirements-dev.lock new file mode 100644 index 00000000..7407894f --- /dev/null +++ b/requirements-dev.lock @@ -0,0 +1,565 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false + +-e file:. +aiofiles==24.1.0 + # via burr +aiohappyeyeballs==2.3.5 + # via aiohttp +aiohttp==3.10.3 + # via langchain + # via langchain-community +aiosignal==1.3.1 + # via aiohttp +alabaster==0.7.16 + # via sphinx +altair==5.4.0 + # via streamlit +annotated-types==0.7.0 + # via pydantic +anyio==4.4.0 + # via httpx + # via openai + # via starlette +astroid==3.2.4 + # via pylint +async-timeout==4.0.3 + # via aiohttp + # via langchain + # via scrapegraphai +attrs==24.2.0 + # via aiohttp + # via jsonschema + # via referencing +babel==2.16.0 + # via sphinx +beautifulsoup4==4.12.3 + # via furo + # via googlesearch-python + # via scrapegraphai +blinker==1.8.2 + # via streamlit +boto3==1.34.158 + # via langchain-aws +botocore==1.34.158 + # via boto3 + # via s3transfer +burr==0.22.1 + # via scrapegraphai +cachetools==5.4.0 + # via google-auth + # via streamlit +certifi==2024.7.4 + # via httpcore + # via httpx + # via requests +charset-normalizer==3.3.2 + # via requests +click==8.1.7 + # via burr + # via streamlit + # via uvicorn +coloredlogs==15.0.1 + # via onnxruntime +contourpy==1.2.1 + # via matplotlib +cycler==0.12.1 + # via matplotlib +dataclasses-json==0.6.7 + # via langchain-community +dill==0.3.8 + # via multiprocess + # via pylint +distro==1.9.0 + # via openai +docutils==0.19 + # via sphinx +exceptiongroup==1.2.2 + # via anyio + # via pytest +fastapi==0.112.0 + # via burr +fastapi-pagination==0.12.26 + # via burr +fastembed==0.3.6 + # via scrapegraphai +filelock==3.15.4 + # via huggingface-hub + # via transformers +flatbuffers==24.3.25 + # via onnxruntime +fonttools==4.53.1 + # via matplotlib +free-proxy==1.1.1 + # via scrapegraphai +frozenlist==1.4.1 + # via aiohttp + # via aiosignal +fsspec==2024.6.1 + # via huggingface-hub +furo==2024.5.6 + # via scrapegraphai +gitdb==4.0.11 + # via gitpython +gitpython==3.1.43 + # via streamlit +google-ai-generativelanguage==0.6.6 + # via google-generativeai +google-api-core==2.19.1 + # via google-ai-generativelanguage + # via google-api-python-client + # via google-generativeai +google-api-python-client==2.140.0 + # via google-generativeai +google-auth==2.33.0 + # via google-ai-generativelanguage + # via google-api-core + # via google-api-python-client + # via google-auth-httplib2 + # via google-generativeai +google-auth-httplib2==0.2.0 + # via google-api-python-client +google-generativeai==0.7.2 + # via langchain-google-genai +googleapis-common-protos==1.63.2 + # via google-api-core + # via grpcio-status +googlesearch-python==1.2.5 + # via scrapegraphai +graphviz==0.20.3 + # via burr +greenlet==3.0.3 + # via playwright +grpcio==1.65.4 + # via google-api-core + # via grpcio-status + # via grpcio-tools + # via qdrant-client +grpcio-status==1.62.3 + # via google-api-core +grpcio-tools==1.62.3 + # via qdrant-client +h11==0.14.0 + # via httpcore + # via uvicorn +h2==4.1.0 + # via httpx +hpack==4.0.0 + # via h2 +html2text==2024.2.26 + # via scrapegraphai +httpcore==1.0.5 + # via httpx +httplib2==0.22.0 + # via google-api-python-client + # via google-auth-httplib2 +httpx==0.27.0 + # via langchain-mistralai + # via langsmith + # via ollama + # via openai + # via qdrant-client +httpx-sse==0.4.0 + # via langchain-mistralai +huggingface-hub==0.24.5 + # via fastembed + # via tokenizers + # via transformers +humanfriendly==10.0 + # via coloredlogs +hyperframe==6.0.1 + # via h2 +idna==3.7 + # via anyio + # via httpx + # via requests + # via yarl +imagesize==1.4.1 + # via sphinx +iniconfig==2.0.0 + # via pytest +isort==5.13.2 + # via pylint +jinja2==3.1.4 + # via altair + # via burr + # via pydeck + # via sphinx +jiter==0.5.0 + # via openai +jmespath==1.0.1 + # via boto3 + # via botocore +jsonpatch==1.33 + # via langchain-core +jsonpointer==3.0.0 + # via jsonpatch +jsonschema==4.23.0 + # via altair + # via mistral-common +jsonschema-specifications==2023.12.1 + # via jsonschema +kiwisolver==1.4.5 + # via matplotlib +langchain==0.3.0 + # via langchain-community + # via scrapegraphai +langchain-aws==0.2.0 + # via scrapegraphai +langchain-community==0.3.0 + # via scrapegraphai +langchain-core==0.3.1 + # via langchain + # via langchain-aws + # via langchain-community + # via langchain-google-genai + # via langchain-mistralai + # via langchain-ollama + # via langchain-openai + # via langchain-text-splitters +langchain-google-genai==2.0.0 + # via scrapegraphai +langchain-mistralai==0.2.0 + # via scrapegraphai +langchain-ollama==0.2.0 + # via scrapegraphai +langchain-openai==0.2.0 + # via scrapegraphai +langchain-text-splitters==0.3.0 + # via langchain +langsmith==0.1.121 + # via langchain + # via langchain-community + # via langchain-core +loguru==0.7.2 + # via burr + # via fastembed +lxml==5.3.0 + # via free-proxy +markdown-it-py==3.0.0 + # via rich +markupsafe==2.1.5 + # via jinja2 +marshmallow==3.21.3 + # via dataclasses-json +matplotlib==3.9.1.post1 + # via burr +mccabe==0.7.0 + # via pylint +mdurl==0.1.2 + # via markdown-it-py +minify-html==0.15.0 + # via scrapegraphai +mistral-common==1.4.1 + # via scrapegraphai +mmh3==4.1.0 + # via fastembed +mpire==2.10.2 + # via semchunk +mpmath==1.3.0 + # via sympy +multidict==6.0.5 + # via aiohttp + # via yarl +multiprocess==0.70.16 + # via mpire +mypy-extensions==1.0.0 + # via typing-inspect +narwhals==1.3.0 + # via altair +numpy==1.26.4 + # via contourpy + # via fastembed + # via langchain + # via langchain-aws + # via langchain-community + # via matplotlib + # via onnx + # via onnxruntime + # via opencv-python-headless + # via pandas + # via pyarrow + # via pydeck + # via qdrant-client + # via sf-hamilton + # via streamlit + # via transformers +ollama==0.3.2 + # via langchain-ollama +onnx==1.17.0 + # via fastembed +onnxruntime==1.19.2 + # via fastembed +openai==1.40.3 + # via burr + # via langchain-openai +opencv-python-headless==4.10.0.84 + # via mistral-common +orjson==3.10.7 + # via langsmith +packaging==24.1 + # via altair + # via huggingface-hub + # via langchain-core + # via marshmallow + # via matplotlib + # via onnxruntime + # via pytest + # via sphinx + # via streamlit + # via transformers +pandas==2.2.2 + # via scrapegraphai + # via sf-hamilton + # via streamlit +pillow==10.4.0 + # via fastembed + # via matplotlib + # via mistral-common + # via streamlit +platformdirs==4.2.2 + # via pylint +playwright==1.45.1 + # via scrapegraphai + # via undetected-playwright +pluggy==1.5.0 + # via pytest +portalocker==2.10.1 + # via qdrant-client +proto-plus==1.24.0 + # via google-ai-generativelanguage + # via google-api-core +protobuf==4.25.4 + # via google-ai-generativelanguage + # via google-api-core + # via google-generativeai + # via googleapis-common-protos + # via grpcio-status + # via grpcio-tools + # via onnx + # via onnxruntime + # via proto-plus + # via streamlit +pyarrow==17.0.0 + # via streamlit +pyasn1==0.6.0 + # via pyasn1-modules + # via rsa +pyasn1-modules==0.4.0 + # via google-auth +pydantic==2.10.1 + # via burr + # via fastapi + # via fastapi-pagination + # via google-generativeai + # via langchain + # via langchain-aws + # via langchain-core + # via langchain-google-genai + # via langchain-mistralai + # via langsmith + # via mistral-common + # via openai + # via pydantic-settings + # via qdrant-client + # via scrapegraph-py +pydantic-core==2.27.1 + # via pydantic +pydantic-settings==2.5.2 + # via langchain-community +pydeck==0.9.1 + # via streamlit +pyee==11.1.0 + # via playwright +pygments==2.18.0 + # via furo + # via mpire + # via rich + # via sphinx +pylint==3.2.6 +pyparsing==3.1.2 + # via httplib2 + # via matplotlib +pystemmer==2.2.0.1 + # via fastembed +pytest==8.0.0 + # via pytest-mock +pytest-mock==3.14.0 +python-dateutil==2.9.0.post0 + # via botocore + # via matplotlib + # via pandas +python-dotenv==1.0.1 + # via pydantic-settings + # via scrapegraph-py + # via scrapegraphai +pytz==2024.1 + # via pandas +pyyaml==6.0.2 + # via huggingface-hub + # via langchain + # via langchain-community + # via langchain-core + # via transformers +qdrant-client==1.11.3 + # via scrapegraphai +referencing==0.35.1 + # via jsonschema + # via jsonschema-specifications +regex==2024.7.24 + # via tiktoken + # via transformers +requests==2.32.3 + # via burr + # via fastembed + # via free-proxy + # via google-api-core + # via googlesearch-python + # via huggingface-hub + # via langchain + # via langchain-community + # via langsmith + # via mistral-common + # via scrapegraph-py + # via sphinx + # via streamlit + # via tiktoken + # via transformers +rich==13.7.1 + # via streamlit +rpds-py==0.20.0 + # via jsonschema + # via referencing +rsa==4.9 + # via google-auth +s3transfer==0.10.2 + # via boto3 +safetensors==0.4.5 + # via transformers +scrapegraph-py==0.0.3 + # via scrapegraphai +semchunk==2.2.0 + # via scrapegraphai +sentencepiece==0.2.0 + # via mistral-common +setuptools==75.1.0 + # via grpcio-tools +sf-hamilton==1.73.1 + # via burr +simpleeval==1.0.0 + # via scrapegraphai +six==1.16.0 + # via python-dateutil +smmap==5.0.1 + # via gitdb +sniffio==1.3.1 + # via anyio + # via httpx + # via openai +snowballstemmer==2.2.0 + # via fastembed + # via sphinx +soupsieve==2.5 + # via beautifulsoup4 +sphinx==6.0.0 + # via furo + # via scrapegraphai + # via sphinx-basic-ng +sphinx-basic-ng==1.0.0b2 + # via furo +sphinxcontrib-applehelp==2.0.0 + # via sphinx +sphinxcontrib-devhelp==2.0.0 + # via sphinx +sphinxcontrib-htmlhelp==2.1.0 + # via sphinx +sphinxcontrib-jsmath==1.0.1 + # via sphinx +sphinxcontrib-qthelp==2.0.0 + # via sphinx +sphinxcontrib-serializinghtml==2.0.0 + # via sphinx +sqlalchemy==2.0.32 + # via langchain + # via langchain-community +starlette==0.37.2 + # via fastapi +streamlit==1.37.1 + # via burr +sympy==1.13.3 + # via onnxruntime +tenacity==8.5.0 + # via langchain + # via langchain-community + # via langchain-core + # via streamlit +tiktoken==0.7.0 + # via langchain-openai + # via mistral-common + # via scrapegraphai +tokenizers==0.19.1 + # via fastembed + # via langchain-mistralai + # via transformers +toml==0.10.2 + # via streamlit +tomli==2.1.0 + # via pylint + # via pytest +tomlkit==0.13.0 + # via pylint +tornado==6.4.1 + # via streamlit +tqdm==4.66.5 + # via fastembed + # via google-generativeai + # via huggingface-hub + # via mpire + # via openai + # via scrapegraphai + # via semchunk + # via transformers +transformers==4.44.2 + # via scrapegraphai +typing-extensions==4.12.2 + # via altair + # via anyio + # via astroid + # via fastapi + # via fastapi-pagination + # via google-generativeai + # via huggingface-hub + # via langchain-core + # via mistral-common + # via openai + # via pydantic + # via pydantic-core + # via pyee + # via sf-hamilton + # via sqlalchemy + # via streamlit + # via typing-inspect + # via uvicorn +typing-inspect==0.9.0 + # via dataclasses-json + # via sf-hamilton +tzdata==2024.1 + # via pandas +undetected-playwright==0.3.0 + # via scrapegraphai +uritemplate==4.1.1 + # via google-api-python-client +urllib3==1.26.19 + # via botocore + # via qdrant-client + # via requests +uvicorn==0.30.5 + # via burr +yarl==1.9.4 + # via aiohttp diff --git a/requirements.lock b/requirements.lock new file mode 100644 index 00000000..fd291ce8 --- /dev/null +++ b/requirements.lock @@ -0,0 +1,403 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false + +-e file:. +aiohttp==3.9.5 + # via langchain + # via langchain-community +aiosignal==1.3.1 + # via aiohttp +annotated-types==0.7.0 + # via pydantic +anyio==4.4.0 + # via httpx + # via openai +async-timeout==4.0.3 + # via aiohttp + # via langchain + # via scrapegraphai +attrs==23.2.0 + # via aiohttp + # via jsonschema + # via referencing +beautifulsoup4==4.12.3 + # via googlesearch-python + # via scrapegraphai +boto3==1.34.146 + # via langchain-aws +botocore==1.34.146 + # via boto3 + # via s3transfer +cachetools==5.4.0 + # via google-auth +certifi==2024.7.4 + # via httpcore + # via httpx + # via requests +charset-normalizer==3.3.2 + # via requests +coloredlogs==15.0.1 + # via onnxruntime +dataclasses-json==0.6.7 + # via langchain-community +dill==0.3.8 + # via multiprocess +distro==1.9.0 + # via openai +exceptiongroup==1.2.2 + # via anyio +fastembed==0.3.6 + # via scrapegraphai +filelock==3.15.4 + # via huggingface-hub + # via transformers +flatbuffers==24.3.25 + # via onnxruntime +free-proxy==1.1.1 + # via scrapegraphai +frozenlist==1.4.1 + # via aiohttp + # via aiosignal +fsspec==2024.6.1 + # via huggingface-hub +google-ai-generativelanguage==0.6.6 + # via google-generativeai +google-api-core==2.19.1 + # via google-ai-generativelanguage + # via google-api-python-client + # via google-generativeai +google-api-python-client==2.137.0 + # via google-generativeai +google-auth==2.32.0 + # via google-ai-generativelanguage + # via google-api-core + # via google-api-python-client + # via google-auth-httplib2 + # via google-generativeai +google-auth-httplib2==0.2.0 + # via google-api-python-client +google-generativeai==0.7.2 + # via langchain-google-genai +googleapis-common-protos==1.63.2 + # via google-api-core + # via grpcio-status +googlesearch-python==1.2.5 + # via scrapegraphai +greenlet==3.0.3 + # via playwright +grpcio==1.65.1 + # via google-api-core + # via grpcio-status + # via grpcio-tools + # via qdrant-client +grpcio-status==1.62.2 + # via google-api-core +grpcio-tools==1.62.3 + # via qdrant-client +h11==0.14.0 + # via httpcore +h2==4.1.0 + # via httpx +hpack==4.0.0 + # via h2 +html2text==2024.2.26 + # via scrapegraphai +httpcore==1.0.5 + # via httpx +httplib2==0.22.0 + # via google-api-python-client + # via google-auth-httplib2 +httpx==0.27.0 + # via langchain-mistralai + # via langsmith + # via ollama + # via openai + # via qdrant-client +httpx-sse==0.4.0 + # via langchain-mistralai +huggingface-hub==0.24.1 + # via fastembed + # via tokenizers + # via transformers +humanfriendly==10.0 + # via coloredlogs +hyperframe==6.0.1 + # via h2 +idna==3.7 + # via anyio + # via httpx + # via requests + # via yarl +jiter==0.5.0 + # via openai +jmespath==1.0.1 + # via boto3 + # via botocore +jsonpatch==1.33 + # via langchain-core +jsonpointer==3.0.0 + # via jsonpatch +jsonschema==4.23.0 + # via mistral-common +jsonschema-specifications==2023.12.1 + # via jsonschema +langchain==0.3.0 + # via langchain-community + # via scrapegraphai +langchain-aws==0.2.0 + # via scrapegraphai +langchain-community==0.3.0 + # via scrapegraphai +langchain-core==0.3.1 + # via langchain + # via langchain-aws + # via langchain-community + # via langchain-google-genai + # via langchain-mistralai + # via langchain-ollama + # via langchain-openai + # via langchain-text-splitters +langchain-google-genai==2.0.0 + # via scrapegraphai +langchain-mistralai==0.2.0 + # via scrapegraphai +langchain-ollama==0.2.0 + # via scrapegraphai +langchain-openai==0.2.0 + # via scrapegraphai +langchain-text-splitters==0.3.0 + # via langchain +langsmith==0.1.121 + # via langchain + # via langchain-community + # via langchain-core +loguru==0.7.2 + # via fastembed +lxml==5.2.2 + # via free-proxy +marshmallow==3.21.3 + # via dataclasses-json +minify-html==0.15.0 + # via scrapegraphai +mistral-common==1.4.1 + # via scrapegraphai +mmh3==4.1.0 + # via fastembed +mpire==2.10.2 + # via semchunk +mpmath==1.3.0 + # via sympy +multidict==6.0.5 + # via aiohttp + # via yarl +multiprocess==0.70.16 + # via mpire +mypy-extensions==1.0.0 + # via typing-inspect +numpy==1.26.4 + # via fastembed + # via langchain + # via langchain-aws + # via langchain-community + # via onnx + # via onnxruntime + # via opencv-python-headless + # via pandas + # via qdrant-client + # via transformers +ollama==0.3.2 + # via langchain-ollama +onnx==1.17.0 + # via fastembed +onnxruntime==1.19.2 + # via fastembed +openai==1.41.0 + # via langchain-openai +opencv-python-headless==4.10.0.84 + # via mistral-common +orjson==3.10.6 + # via langsmith +packaging==24.1 + # via huggingface-hub + # via langchain-core + # via marshmallow + # via onnxruntime + # via transformers +pandas==2.2.2 + # via scrapegraphai +pillow==10.4.0 + # via fastembed + # via mistral-common +playwright==1.45.1 + # via scrapegraphai + # via undetected-playwright +portalocker==2.10.1 + # via qdrant-client +proto-plus==1.24.0 + # via google-ai-generativelanguage + # via google-api-core +protobuf==4.25.3 + # via google-ai-generativelanguage + # via google-api-core + # via google-generativeai + # via googleapis-common-protos + # via grpcio-status + # via grpcio-tools + # via onnx + # via onnxruntime + # via proto-plus +pyasn1==0.6.0 + # via pyasn1-modules + # via rsa +pyasn1-modules==0.4.0 + # via google-auth +pydantic==2.10.1 + # via google-generativeai + # via langchain + # via langchain-aws + # via langchain-core + # via langchain-google-genai + # via langchain-mistralai + # via langsmith + # via mistral-common + # via openai + # via pydantic-settings + # via qdrant-client + # via scrapegraph-py +pydantic-core==2.27.1 + # via pydantic +pydantic-settings==2.5.2 + # via langchain-community +pyee==11.1.0 + # via playwright +pygments==2.18.0 + # via mpire +pyparsing==3.1.2 + # via httplib2 +pystemmer==2.2.0.1 + # via fastembed +python-dateutil==2.9.0.post0 + # via botocore + # via pandas +python-dotenv==1.0.1 + # via pydantic-settings + # via scrapegraph-py + # via scrapegraphai +pytz==2024.1 + # via pandas +pyyaml==6.0.1 + # via huggingface-hub + # via langchain + # via langchain-community + # via langchain-core + # via transformers +qdrant-client==1.11.3 + # via scrapegraphai +referencing==0.35.1 + # via jsonschema + # via jsonschema-specifications +regex==2024.5.15 + # via tiktoken + # via transformers +requests==2.32.3 + # via fastembed + # via free-proxy + # via google-api-core + # via googlesearch-python + # via huggingface-hub + # via langchain + # via langchain-community + # via langsmith + # via mistral-common + # via scrapegraph-py + # via tiktoken + # via transformers +rpds-py==0.20.0 + # via jsonschema + # via referencing +rsa==4.9 + # via google-auth +s3transfer==0.10.2 + # via boto3 +safetensors==0.4.5 + # via transformers +scrapegraph-py==0.0.3 + # via scrapegraphai +semchunk==2.2.0 + # via scrapegraphai +sentencepiece==0.2.0 + # via mistral-common +setuptools==75.1.0 + # via grpcio-tools +simpleeval==1.0.0 + # via scrapegraphai +six==1.16.0 + # via python-dateutil +sniffio==1.3.1 + # via anyio + # via httpx + # via openai +snowballstemmer==2.2.0 + # via fastembed +soupsieve==2.5 + # via beautifulsoup4 +sqlalchemy==2.0.31 + # via langchain + # via langchain-community +sympy==1.13.3 + # via onnxruntime +tenacity==8.5.0 + # via langchain + # via langchain-community + # via langchain-core +tiktoken==0.7.0 + # via langchain-openai + # via mistral-common + # via scrapegraphai +tokenizers==0.19.1 + # via fastembed + # via langchain-mistralai + # via transformers +tqdm==4.66.4 + # via fastembed + # via google-generativeai + # via huggingface-hub + # via mpire + # via openai + # via scrapegraphai + # via semchunk + # via transformers +transformers==4.44.2 + # via scrapegraphai +typing-extensions==4.12.2 + # via anyio + # via google-generativeai + # via huggingface-hub + # via langchain-core + # via mistral-common + # via openai + # via pydantic + # via pydantic-core + # via pyee + # via sqlalchemy + # via typing-inspect +typing-inspect==0.9.0 + # via dataclasses-json +tzdata==2024.1 + # via pandas +undetected-playwright==0.3.0 + # via scrapegraphai +uritemplate==4.1.1 + # via google-api-python-client +urllib3==1.26.19 + # via botocore + # via qdrant-client + # via requests +yarl==1.9.4 + # via aiohttp diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 594420f5..340f69bb 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -13,6 +13,7 @@ ConditionalNode ) from ..prompts import REGEN_ADDITIONAL_INFO +from scrapegraph_py import SyncClient class SmartScraperGraph(AbstractGraph): """ @@ -59,6 +60,15 @@ def _create_graph(self) -> BaseGraph: Returns: BaseGraph: A graph instance representing the web scraping workflow. """ + if self.llm_model == "scrapegraphai/smart-scraper": + + sgai_client = SyncClient(api_key=self.config.get("api_key")) + + response = sgai_client.smartscraper( + website_url=self.source, + user_prompt=self.prompt + ) + return response fetch_node = FetchNode( input="url| local_dir",