diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
index 4c4dbecc..0a2af007 100644
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -2,7 +2,7 @@
 github: ScrapeGraphAI
 patreon: # Replace with a single Patreon username
-open_collective:
+open_collective: https://opencollective.com/scrapegraphai
 ko_fi: # Replace with a single Ko-fi username
 tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
diff --git a/CHANGELOG.md b/CHANGELOG.md
index fb9ab58a..436433a6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,5 @@
+## [1.30.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.29.0...v1.30.0) (2024-11-06)
+
 ## [1.30.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.30.0-beta.4...v1.30.0-beta.5) (2024-11-18)
 
 
@@ -36,6 +38,8 @@
 
 ### Features
 
+* update chromium ([38c6dd2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/38c6dd2aa1ce31b981eb8c35a56e9533d19df81b))
+
 * Turkish language support has been added to README.md ([60f673d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/60f673dc39cba70706291e11211b9ad180860e24))
 
 ## [1.29.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0...v1.29.0) (2024-11-04)
diff --git a/README.md b/README.md
index d881cd41..37ae7b03 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,10 @@
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg?style=for-the-badge)](https://opensource.org/licenses/MIT)
 [![](https://dcbadge.vercel.app/api/server/gkxQDAjfeX)](https://discord.gg/gkxQDAjfeX)
 
+<div align="center">
+VinciGit00%2FScrapegraph-ai | Trendshift
+</div>
+
 
 ScrapeGraphAI is a *web scraping* python library that uses LLM and direct graph logic to create scraping pipelines for websites and local documents (XML, HTML, JSON, Markdown, etc.). Just say which information you want to extract and the library will do it for you!
diff --git a/extract_data.py b/extract_data.py
deleted file mode 100644
index df3babc2..00000000
--- a/extract_data.py
+++ /dev/null
@@ -1,27 +0,0 @@
-def extract_data(html: str) -> dict:
-    from bs4 import BeautifulSoup
-
-    # Parse the HTML content using BeautifulSoup
-    soup = BeautifulSoup(html, 'html.parser')
-
-    # Initialize an empty list to hold project data
-    projects = []
-
-    # Find all project entries in the HTML
-    project_entries = soup.find_all('div', class_='grid-item')
-
-    # Iterate over each project entry to extract title and description
-    for entry in project_entries:
-        # Extract the title from the h4 element
-        title = entry.find('h4', class_='card-title').get_text(strip=True)
-        # Extract the description from the p element
-        description = entry.find('p', class_='card-text').get_text(strip=True)
-
-        # Append the extracted data as a dictionary to the projects list
-        projects.append({
-            'title': title,
-            'description': description
-        })
-
-    # Return the structured data as a dictionary matching the desired JSON schema
-    return {'projects': projects}
\ No newline at end of file
diff --git a/extracted_data.py b/extracted_data.py
deleted file mode 100644
index 45da5e49..00000000
--- a/extracted_data.py
+++ /dev/null
@@ -1,28 +0,0 @@
-def extract_data(html: str) -> dict:
-    from bs4 import BeautifulSoup
-
-    # Parse the HTML content using BeautifulSoup
-    soup = BeautifulSoup(html, 'html.parser')
-
-    # Initialize an empty list to hold project data
-    projects = []
-
-    # Find all project entries in the HTML
-    project_entries = soup.find_all('div', class_='grid-item')
-
-    # Iterate over each project entry to extract title and description
-    for entry in project_entries:
-        # Extract the title from the card-title class
-        title = entry.find('h4', class_='card-title').get_text(strip=True)
-
-        # Extract the description from the card-text class
-        description = entry.find('p', class_='card-text').get_text(strip=True)
-
-        # Append the extracted data as a dictionary to the projects list
-        projects.append({
-            'title': title,
-            'description': description
-        })
-
-    # Return the structured data as a dictionary matching the desired JSON schema
-    return {'projects': projects}
\ No newline at end of file
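The two files deleted above, `extract_data.py` and `extracted_data.py`, are near-identical copies of the same BeautifulSoup helper, which is why both can go. For reference, a minimal runnable sketch of what that helper does; the sample HTML is invented and simply mirrors the `grid-item` / `card-title` / `card-text` classes the function expects:

```python
from bs4 import BeautifulSoup


def extract_data(html: str) -> dict:
    """Condensed version of the deleted helper: collect card titles and descriptions."""
    soup = BeautifulSoup(html, "html.parser")
    projects = []
    for entry in soup.find_all("div", class_="grid-item"):
        projects.append({
            "title": entry.find("h4", class_="card-title").get_text(strip=True),
            "description": entry.find("p", class_="card-text").get_text(strip=True),
        })
    return {"projects": projects}


# Invented sample input, shaped like the markup the helper targets.
sample_html = """
<div class="grid-item">
  <h4 class="card-title">ScrapeGraphAI</h4>
  <p class="card-text">LLM-powered scraping pipelines.</p>
</div>
"""

print(extract_data(sample_html))
# {'projects': [{'title': 'ScrapeGraphAI', 'description': 'LLM-powered scraping pipelines.'}]}
```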
diff --git a/pyproject.toml b/pyproject.toml
index 3211252f..9cbe60f7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,11 +2,13 @@
 name = "scrapegraphai"
+
 version = "1.30.0b5"
+
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
 authors = [
     { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" },
diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py
index 48058436..cf784e95 100644
--- a/scrapegraphai/docloaders/chromium.py
+++ b/scrapegraphai/docloaders/chromium.py
@@ -1,6 +1,3 @@
-"""
-chromiumloader module
-"""
 import asyncio
 from typing import Any, AsyncIterator, Iterator, List, Optional
 from langchain_community.document_loaders.base import BaseLoader
@@ -12,15 +9,16 @@
 logger = get_logger("web-loader")
 
 class ChromiumLoader(BaseLoader):
-    """scrapes HTML pages from URLs using a (headless) instance of the
-    Chromium web driver with proxy protection
+    """Scrapes HTML pages from URLs using a (headless) instance of the
+    Chromium web driver with proxy protection.
 
     Attributes:
         backend: The web driver backend library; defaults to 'playwright'.
         browser_config: A dictionary containing additional browser kwargs.
-        headless: whether to run browser in headless mode.
+        headless: Whether to run browser in headless mode.
         proxy: A dictionary containing proxy settings; None disables protection.
         urls: A list of URLs to scrape content from.
+        requires_js_support: Flag to determine if JS rendering is required.
     """
 
     RETRY_LIMIT = 3
@@ -34,15 +32,17 @@ def __init__(
         headless: bool = True,
         proxy: Optional[Proxy] = None,
         load_state: str = "domcontentloaded",
+        requires_js_support: bool = False,
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.
 
         Args:
             backend: The web driver backend library; defaults to 'playwright'.
-            headless: whether to run browser in headless mode.
+            headless: Whether to run browser in headless mode.
             proxy: A dictionary containing proxy information; None disables protection.
             urls: A list of URLs to scrape content from.
+            requires_js_support: Whether to use JS rendering for scraping.
             kwargs: A dictionary containing additional browser kwargs.
 
         Raises:
@@ -61,6 +61,7 @@ def __init__(
         self.proxy = parse_or_search_proxy(proxy) if proxy else None
         self.urls = urls
         self.load_state = load_state
+        self.requires_js_support = requires_js_support
 
     async def ascrape_undetected_chromedriver(self, url: str) -> str:
         """
@@ -186,7 +187,9 @@ def lazy_load(self) -> Iterator[Document]:
         Yields:
             Document: The scraped content encapsulated within a Document object.
         """
-        scraping_fn = getattr(self, f"ascrape_{self.backend}")
+        scraping_fn = (
+            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+        )
 
         for url in self.urls:
             html_content = asyncio.run(scraping_fn(url))
@@ -206,7 +209,9 @@ async def alazy_load(self) -> AsyncIterator[Document]:
             Document: A Document object containing the scraped content, along with
                       its source URL as metadata.
         """
-        scraping_fn = getattr(self, f"ascrape_{self.backend}")
+        scraping_fn = (
+            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+        )
 
         tasks = [scraping_fn(url) for url in self.urls]
         results = await asyncio.gather(*tasks)
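The `chromium.py` changes above add a `requires_js_support` flag that reroutes both `lazy_load` and `alazy_load` to `ascrape_with_js_support` instead of the backend-named coroutine (`ascrape_playwright`, `ascrape_undetected_chromedriver`, ...). A rough usage sketch, assuming the URL list is the loader's first positional argument (as the docstring suggests) and that `ascrape_with_js_support` is defined elsewhere in the module, since this diff references it but does not show it:

```python
from scrapegraphai.docloaders.chromium import ChromiumLoader

# Placeholder URL; a page that needs JavaScript rendering is the intended case.
loader = ChromiumLoader(
    ["https://example.com"],
    backend="playwright",
    headless=True,
    requires_js_support=True,  # dispatch to ascrape_with_js_support
)

# With the flag left at its default of False, lazy_load() falls back to
# getattr(self, f"ascrape_{self.backend}") exactly as before, so existing
# callers are unaffected.
for doc in loader.lazy_load():
    print(doc.metadata, len(doc.page_content))
```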
""" - scraping_fn = getattr(self, f"ascrape_{self.backend}") + scraping_fn = ( + self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}") + ) tasks = [scraping_fn(url) for url in self.urls] results = await asyncio.gather(*tasks) diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 06842ca4..55f05ab6 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -170,8 +170,9 @@ def handle_file(self, state, input_type, source): compressed_document = self.load_file_content(source, input_type) - return self.update_state(state, compressed_document) - + # return self.update_state(state, compressed_document) + state.update({self.output[0]: compressed_document}) + return state def load_file_content(self, source, input_type): """ Loads the content of a file based on its input type. @@ -230,8 +231,9 @@ def handle_local_source(self, state, source): Document(page_content=parsed_content, metadata={"source": "local_dir"}) ] - return self.update_state(state, compressed_document) - + # return self.update_state(state, compressed_document) + state.update({self.output[0]: compressed_document}) + return state def handle_web_source(self, state, source): """ Handles the web source by fetching HTML content from a URL, diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 01d834b8..758cdaf1 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -138,6 +138,7 @@ def invoke_with_timeout(chain, inputs, timeout): partial_variables={"context": doc, "format_instructions": format_instructions} ) chain = prompt | self.llm_model + try: raw_response = invoke_with_timeout(chain, {"question": user_prompt}, self.timeout) except Timeout: @@ -145,19 +146,9 @@ def invoke_with_timeout(chain, inputs, timeout): return state if output_parser: - try: - answer = output_parser.parse(raw_response.content) - except JSONDecodeError: - lines = raw_response.split('\n') - if lines[0].strip().startswith('```'): - lines = lines[1:] - if lines[-1].strip().endswith('```'): - lines = lines[:-1] - cleaned_response = '\n'.join(lines) - answer = output_parser.parse(cleaned_response) - else: - answer = raw_response.content + chain = chain | output_parser + answer = chain.invoke({"question": user_prompt}) state.update({self.output[0]: answer}) return state