
Commit 1c090cb
Merge pull request #808 from ScrapeGraphAI/main
allignment
VinciGit00 authored Nov 19, 2024
2 parents af901a5 + 98cf5f1 commit 1c090cb
Showing 9 changed files with 34 additions and 81 deletions.
2 changes: 1 addition & 1 deletion .github/FUNDING.yml
@@ -2,7 +2,7 @@

github: ScrapeGraphAI
patreon: # Replace with a single Patreon username
- open_collective:
+ open_collective: https://opencollective.com/scrapegraphai
ko_fi: # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,5 @@
+ ## [1.30.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.29.0...v1.30.0) (2024-11-06)
+
## [1.30.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.30.0-beta.4...v1.30.0-beta.5) (2024-11-18)


@@ -36,6 +38,8 @@

### Features

+ * update chromium ([38c6dd2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/38c6dd2aa1ce31b981eb8c35a56e9533d19df81b))
+
* Turkish language support has been added to README.md ([60f673d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/60f673dc39cba70706291e11211b9ad180860e24))

## [1.29.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0...v1.29.0) (2024-11-04)
4 changes: 4 additions & 0 deletions README.md
@@ -12,6 +12,10 @@
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg?style=for-the-badge)](https://opensource.org/licenses/MIT)
[![](https://dcbadge.vercel.app/api/server/gkxQDAjfeX)](https://discord.gg/gkxQDAjfeX)

+ <p align="center">
+ <a href="https://trendshift.io/repositories/9761" target="_blank"><img src="https://trendshift.io/api/badge/repositories/9761" alt="VinciGit00%2FScrapegraph-ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+ <p align="center">
+
ScrapeGraphAI is a *web scraping* python library that uses LLM and direct graph logic to create scraping pipelines for websites and local documents (XML, HTML, JSON, Markdown, etc.).

Just say which information you want to extract and the library will do it for you!
27 changes: 0 additions & 27 deletions extract_data.py

This file was deleted.

28 changes: 0 additions & 28 deletions extracted_data.py

This file was deleted.

2 changes: 2 additions & 0 deletions pyproject.toml
@@ -2,11 +2,13 @@
name = "scrapegraphai"



version = "1.30.0b5"





description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
authors = [
{ name = "Marco Vinciguerra", email = "mvincig11@gmail.com" },
23 changes: 14 additions & 9 deletions scrapegraphai/docloaders/chromium.py
@@ -1,6 +1,3 @@
"""
chromiumloader module
"""
import asyncio
from typing import Any, AsyncIterator, Iterator, List, Optional
from langchain_community.document_loaders.base import BaseLoader
@@ -12,15 +9,16 @@
logger = get_logger("web-loader")

class ChromiumLoader(BaseLoader):
"""scrapes HTML pages from URLs using a (headless) instance of the
Chromium web driver with proxy protection
"""Scrapes HTML pages from URLs using a (headless) instance of the
Chromium web driver with proxy protection.
Attributes:
backend: The web driver backend library; defaults to 'playwright'.
browser_config: A dictionary containing additional browser kwargs.
- headless: whether to run browser in headless mode.
+ headless: Whether to run browser in headless mode.
proxy: A dictionary containing proxy settings; None disables protection.
urls: A list of URLs to scrape content from.
+ requires_js_support: Flag to determine if JS rendering is required.
"""

RETRY_LIMIT = 3
Expand All @@ -34,15 +32,17 @@ def __init__(
headless: bool = True,
proxy: Optional[Proxy] = None,
load_state: str = "domcontentloaded",
+ requires_js_support: bool = False,
**kwargs: Any,
):
"""Initialize the loader with a list of URL paths.
Args:
backend: The web driver backend library; defaults to 'playwright'.
- headless: whether to run browser in headless mode.
+ headless: Whether to run browser in headless mode.
proxy: A dictionary containing proxy information; None disables protection.
urls: A list of URLs to scrape content from.
+ requires_js_support: Whether to use JS rendering for scraping.
kwargs: A dictionary containing additional browser kwargs.
Raises:
@@ -61,6 +61,7 @@ def __init__(
self.proxy = parse_or_search_proxy(proxy) if proxy else None
self.urls = urls
self.load_state = load_state
+ self.requires_js_support = requires_js_support

async def ascrape_undetected_chromedriver(self, url: str) -> str:
"""
@@ -186,7 +187,9 @@ def lazy_load(self) -> Iterator[Document]:
Yields:
Document: The scraped content encapsulated within a Document object.
"""
- scraping_fn = getattr(self, f"ascrape_{self.backend}")
+ scraping_fn = (
+     self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+ )

for url in self.urls:
html_content = asyncio.run(scraping_fn(url))
@@ -206,7 +209,9 @@ async def alazy_load(self) -> AsyncIterator[Document]:
Document: A Document object containing the scraped content, along with its
source URL as metadata.
"""
- scraping_fn = getattr(self, f"ascrape_{self.backend}")
+ scraping_fn = (
+     self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+ )

tasks = [scraping_fn(url) for url in self.urls]
results = await asyncio.gather(*tasks)
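For orientation, here is a minimal usage sketch of the new requires_js_support flag. The constructor arguments follow the docstring in the diff above; the URL is illustrative.

# Sketch only; assumes the scrapegraphai package at this commit.
from scrapegraphai.docloaders.chromium import ChromiumLoader

loader = ChromiumLoader(
    ["https://example.com"],     # urls: list of pages to scrape (illustrative)
    backend="playwright",        # default web driver backend
    headless=True,
    requires_js_support=True,    # new flag added in this commit
)

# With the flag set, lazy_load()/alazy_load() dispatch to
# ascrape_with_js_support instead of ascrape_<backend>.
for doc in loader.lazy_load():
    print(doc.metadata["source"], len(doc.page_content))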
10 changes: 6 additions & 4 deletions scrapegraphai/nodes/fetch_node.py
@@ -170,8 +170,9 @@ def handle_file(self, state, input_type, source):

compressed_document = self.load_file_content(source, input_type)

- return self.update_state(state, compressed_document)
-
+ # return self.update_state(state, compressed_document)
+ state.update({self.output[0]: compressed_document})
+ return state
def load_file_content(self, source, input_type):
"""
Loads the content of a file based on its input type.
@@ -230,8 +231,9 @@ def handle_local_source(self, state, source):
Document(page_content=parsed_content, metadata={"source": "local_dir"})
]

- return self.update_state(state, compressed_document)
-
+ # return self.update_state(state, compressed_document)
+ state.update({self.output[0]: compressed_document})
+ return state
def handle_web_source(self, state, source):
"""
Handles the web source by fetching HTML content from a URL,
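Both handlers now write their result straight into the shared state dict under the node's first declared output key instead of going through update_state. A standalone sketch of that convention, with hypothetical key names and payload:

# Hypothetical values; in FetchNode these come from the node's config.
state = {"user_prompt": "Extract the title"}
output = ["doc"]                                # node's declared output keys
compressed_document = ["<parsed file content>"]

state.update({output[0]: compressed_document})  # the new pattern
assert state["doc"] == compressed_document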
15 changes: 3 additions & 12 deletions scrapegraphai/nodes/generate_answer_node.py
@@ -138,26 +138,17 @@ def invoke_with_timeout(chain, inputs, timeout):
partial_variables={"context": doc, "format_instructions": format_instructions}
)
chain = prompt | self.llm_model

try:
    raw_response = invoke_with_timeout(chain, {"question": user_prompt}, self.timeout)
except Timeout:
    state.update({self.output[0]: {"error": "Response timeout exceeded"}})
    return state

if output_parser:
-     try:
-         answer = output_parser.parse(raw_response.content)
-     except JSONDecodeError:
-         lines = raw_response.split('\n')
-         if lines[0].strip().startswith('```'):
-             lines = lines[1:]
-         if lines[-1].strip().endswith('```'):
-             lines = lines[:-1]
-         cleaned_response = '\n'.join(lines)
-         answer = output_parser.parse(cleaned_response)
- else:
-     answer = raw_response.content
+     chain = chain | output_parser
+
+ answer = chain.invoke({"question": user_prompt})

state.update({self.output[0]: answer})
return state

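The hand-rolled stripping of Markdown code fences is replaced by piping the output parser into the runnable chain; LangChain's JSON parsers already tolerate fenced model output. A sketch of the same composition in isolation, where the model and prompt are illustrative rather than the node's actual configuration:

# Illustrative composition; assumes langchain-core and langchain-openai.
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

prompt = PromptTemplate(
    template="Use the context to answer.\n{context}\nQuestion: {question}",
    input_variables=["question"],
    partial_variables={"context": "<scraped page text>"},
)

chain = prompt | ChatOpenAI(model="gpt-4o-mini")
output_parser = JsonOutputParser()
chain = chain | output_parser    # the parser strips code fences itself

answer = chain.invoke({"question": "What is the page title?"})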
