import warnings
from typing import Any, AsyncIterator, Iterator, List, Optional, Set, Union
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.web_base import WebBaseLoader
class GitbookLoader(BaseLoader):
"""Load `GitBook` data.
1. load from either a single page, or
2. load all (relative) paths in the sitemap, handling nested sitemap indexes.
When `load_all_paths=True`, the loader parses XML sitemaps and requires the
`lxml` package to be installed (`pip install lxml`).
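
Example:
    A minimal usage sketch; the URL below is illustrative, not a value
    taken from this module::

        from langchain_community.document_loaders import GitbookLoader

        loader = GitbookLoader(
            "https://docs.example.com",
            load_all_paths=True,
        )
        docs = loader.load()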
"""
def __init__(
self,
web_page: str,
load_all_paths: bool = False,
base_url: Optional[str] = None,
content_selector: str = "main",
continue_on_failure: bool = False,
show_progress: bool = True,
*,
sitemap_url: Optional[str] = None,
allowed_domains: Optional[Set[str]] = None,
):
"""Initialize with web page and whether to load all paths.
Args:
web_page: The web page to load or the starting point from where
relative paths are discovered.
load_all_paths: If set to True, all relative paths listed in the sitemap
are loaded instead of only `web_page`. Requires the `lxml` package.
base_url: If `load_all_paths` is True, the relative paths are
appended to this base URL. Defaults to `web_page`.
content_selector: The CSS selector for the content to load.
Defaults to "main".
continue_on_failure: Whether to continue loading the sitemap if an error
occurs while loading a URL, emitting a warning instead of raising an
exception. Setting this to True makes the loader more robust, but may
also result in missing data. Default: False
show_progress: Whether to show a progress bar while loading. Default: True
sitemap_url: Custom sitemap URL to use when load_all_paths is True.
Defaults to "{base_url}/sitemap.xml".
allowed_domains: Optional set of allowed domains to fetch from.
If None (default), the loader will restrict crawling to the domain
of the `web_page` URL to prevent potential SSRF vulnerabilities.
Provide an explicit set (e.g., {"example.com", "docs.example.com"})
to allow crawling across multiple domains. Use with caution in
server environments where users might control the input URLs.
"""
self.base_url = base_url or web_page
if self.base_url.endswith("/"):
self.base_url = self.base_url[:-1]
self.web_page = web_page
self.load_all_paths = load_all_paths
self.content_selector = content_selector
self.continue_on_failure = continue_on_failure
self.show_progress = show_progress
self.allowed_domains = allowed_domains
# If allowed_domains is not specified, extract domain from web_page as default
if self.allowed_domains is None:
initial_domain = urlparse(web_page).netloc
if initial_domain:
self.allowed_domains = {initial_domain}
# Determine the starting URL (either a sitemap or a direct page)
if load_all_paths:
self.start_url = sitemap_url or f"{self.base_url}/sitemap.xml"
else:
self.start_url = web_page
# Validate the start_url is allowed
if not self._is_url_allowed(self.start_url):
raise ValueError(
f"Domain in {self.start_url} is not in the allowed domains list: "
f"{self.allowed_domains}"
)
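# Illustrative configuration of the domain allow-list (the hostnames below are
# placeholders, not values used anywhere in this module):
#
#   loader = GitbookLoader(
#       "https://docs.example.com",
#       load_all_paths=True,
#       allowed_domains={"docs.example.com", "example.com"},
#   )
#
# With allowed_domains=None (the default), only the domain of `web_page`
# ("docs.example.com" here) would be crawled.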
def _is_url_allowed(self, url: str) -> bool:
"""Check if a URL has an allowed scheme and domain."""
# It's assumed self.allowed_domains is always set by __init__
# either explicitly or derived from web_page. If it's somehow still
# None here, it indicates an initialization issue, so denying is safer.
if self.allowed_domains is None:
return False # Should not happen if init worked
try:
parsed = urlparse(url)
# 1. Validate the URL scheme
if parsed.scheme not in ("http", "https"):
return False
# 2. Validate the domain: require an exact netloc match in the allow-list.
# Ensure netloc is not empty before checking membership
if not parsed.netloc:
return False
return parsed.netloc in self.allowed_domains
except Exception: # Catch potential urlparse errors
return False
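# Illustrative outcomes of the check above, assuming
# allowed_domains == {"docs.example.com"}:
#
#   "https://docs.example.com/sitemap.xml"  -> allowed
#   "http://docs.example.com/page"          -> allowed (http is accepted)
#   "https://evil.example.net/page"         -> rejected (not in the allow-list)
#   "ftp://docs.example.com/file"           -> rejected (scheme not http/https)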
def _safe_add_url(
self, url_list: List[str], url: str, url_type: str = "URL"
) -> bool:
"""Safely add a URL to a list if it's from an allowed domain.
Args:
url_list: The list to add the URL to
url: The URL to add
url_type: Type of URL for warning message (e.g., "sitemap", "content")
Returns:
bool: True if URL was added, False if skipped
"""
if self._is_url_allowed(url):
url_list.append(url)
return True
else:
warnings.warn(f"Skipping disallowed {url_type} URL: {url}")
return False
def _create_web_loader(self, url_or_urls: Union[str, List[str]]) -> WebBaseLoader:
"""Create a new WebBaseLoader instance for the given URL(s).
This ensures each operation gets its own isolated WebBaseLoader.
"""
return WebBaseLoader(
web_path=url_or_urls,
continue_on_failure=self.continue_on_failure,
show_progress=self.show_progress,
)
def _is_sitemap_index(self, soup: BeautifulSoup) -> bool:
"""Check if the soup contains a sitemap index."""
return soup.find("sitemapindex") is not None
def _extract_sitemap_urls(self, soup: BeautifulSoup) -> List[str]:
"""Extract sitemap URLs from a sitemap index."""
sitemap_tags = soup.find_all("sitemap")
urls: List[str] = []
for sitemap in sitemap_tags:
loc = sitemap.find("loc")
if loc and loc.text:
self._safe_add_url(urls, loc.text, "sitemap")
return urls
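# For reference, a sitemap index (as opposed to a content sitemap) nests
# <sitemap><loc>...</loc></sitemap> entries under a <sitemapindex> root,
# e.g. (illustrative URLs):
#
#   <sitemapindex>
#     <sitemap><loc>https://docs.example.com/sitemap-pages.xml</loc></sitemap>
#     <sitemap><loc>https://docs.example.com/sitemap-blog.xml</loc></sitemap>
#   </sitemapindex>
#
# A content sitemap instead lists page URLs under <urlset>/<url>/<loc>,
# which is what _get_paths extracts.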
def _process_sitemap(
self,
soup: BeautifulSoup,
processed_urls: Set[str],
web_loader: Optional[WebBaseLoader] = None,
) -> List[str]:
"""Process a sitemap, handling both direct content URLs and sitemap indexes.
Args:
soup: The BeautifulSoup object of the sitemap
processed_urls: Set of already processed URLs to avoid cycles
web_loader: WebBaseLoader instance to reuse for all requests,
created if None
"""
# Create a loader if not provided
if web_loader is None:
web_loader = self._create_web_loader(self.start_url)
# If it's a sitemap index, recursively process each sitemap URL
if self._is_sitemap_index(soup):
sitemap_urls = self._extract_sitemap_urls(soup)
all_content_urls = []
for sitemap_url in sitemap_urls:
if sitemap_url in processed_urls:
warnings.warn(
f"Skipping already processed sitemap URL: {sitemap_url}"
)
continue
processed_urls.add(sitemap_url)
try:
# Temporarily override the web_path of the loader
original_web_paths = web_loader.web_paths
web_loader.web_paths = [sitemap_url]
# Reuse the same loader for the next sitemap,
# explicitly use lxml-xml
sitemap_soup = web_loader.scrape(parser="lxml-xml")
# Restore original web_paths
web_loader.web_paths = original_web_paths
# Recursive call with the same loader
content_urls = self._process_sitemap(
sitemap_soup, processed_urls, web_loader
)
all_content_urls.extend(content_urls)
except Exception as e:
if self.continue_on_failure:
warnings.warn(f"Error processing sitemap {sitemap_url}: {e}")
else:
raise
return all_content_urls
else:
# It's a content sitemap, so extract content URLs
return self._get_paths(soup)
async def _aprocess_sitemap(
self,
soup: BeautifulSoup,
base_url: str,
processed_urls: Set[str],
web_loader: Optional[WebBaseLoader] = None,
) -> List[str]:
"""Async version of _process_sitemap.
Args:
soup: The BeautifulSoup object of the sitemap
base_url: The base URL for relative paths
processed_urls: Set of already processed URLs to avoid cycles
web_loader: WebBaseLoader instance to reuse for all requests,
created if None
"""
# Create a loader if not provided
if web_loader is None:
web_loader = self._create_web_loader(self.start_url)
# If it's a sitemap index, recursively process each sitemap URL
if self._is_sitemap_index(soup):
sitemap_urls = self._extract_sitemap_urls(soup)
all_content_urls = []
# Filter out already processed URLs
new_urls = [url for url in sitemap_urls if url not in processed_urls]
if not new_urls:
return []
# Update the web_paths of the loader to fetch all sitemaps at once
original_web_paths = web_loader.web_paths
web_loader.web_paths = new_urls
# Use the same WebBaseLoader's ascrape_all for efficient parallel
# fetching, explicitly use lxml-xml
soups = await web_loader.ascrape_all(new_urls, parser="lxml-xml")
# Restore original web_paths
web_loader.web_paths = original_web_paths
for sitemap_url, sitemap_soup in zip(new_urls, soups):
processed_urls.add(sitemap_url)
try:
# Recursive call with the same loader
content_urls = await self._aprocess_sitemap(
sitemap_soup, base_url, processed_urls, web_loader
)
all_content_urls.extend(content_urls)
except Exception as e:
if self.continue_on_failure:
warnings.warn(f"Error processing sitemap {sitemap_url}: {e}")
else:
raise
return all_content_urls
else:
# It's a content sitemap, so extract content URLs
return self._get_paths(soup)
def lazy_load(self) -> Iterator[Document]:
"""Fetch text from one single GitBook page or recursively from sitemap."""
if not self.load_all_paths:
# Simple case: load a single page
temp_loader = self._create_web_loader(self.web_page)
soup = temp_loader.scrape()
doc = self._get_document(soup, self.web_page)
if doc:
yield doc
else:
# Get initial sitemap using the recursive method
temp_loader = self._create_web_loader(self.start_url)
# Explicitly use lxml-xml for parsing the initial sitemap
soup_info = temp_loader.scrape(parser="lxml-xml")
# Process sitemap(s) recursively to get all content URLs
processed_urls: Set[str] = set()
relative_paths = self._process_sitemap(soup_info, processed_urls)
if not relative_paths and self.show_progress:
warnings.warn(f"No content URLs found in sitemap at {self.start_url}")
# Build full URLs from relative paths
urls: List[str] = []
for url in relative_paths:
# URLs from _get_paths are already absolute
self._safe_add_url(urls, url, "content")
if not urls:
return
# Create a loader for content pages
content_loader = self._create_web_loader(urls)
# Use WebBaseLoader to fetch all pages
soup_infos = content_loader.scrape_all(urls)
for soup_info, url in zip(soup_infos, urls):
doc = self._get_document(soup_info, url)
if doc:
yield doc
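# Usage sketch for the lazy path (illustrative URL): documents are yielded
# one at a time, so large sites need not be fully materialized in memory.
#
#   loader = GitbookLoader("https://docs.example.com/some/page")
#   for doc in loader.lazy_load():
#       print(doc.metadata["source"], len(doc.page_content))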
async def alazy_load(self) -> AsyncIterator[Document]:
"""Asynchronously fetch text from GitBook page(s)."""
if not self.load_all_paths:
# Simple case: load a single page asynchronously
temp_loader = self._create_web_loader(self.web_page)
soups = await temp_loader.ascrape_all([self.web_page])
soup_info = soups[0]
doc = self._get_document(soup_info, self.web_page)
if doc:
yield doc
else:
# Get initial sitemap - web_loader will be created in _aprocess_sitemap
temp_loader = self._create_web_loader(self.start_url)
# Explicitly use lxml-xml for parsing the initial sitemap
soups = await temp_loader.ascrape_all([self.start_url], parser="lxml-xml")
soup_info = soups[0]
# Process sitemap(s) recursively to get all content URLs
processed_urls: Set[str] = set()
relative_paths = await self._aprocess_sitemap(
soup_info, self.base_url, processed_urls
)
if not relative_paths and self.show_progress:
warnings.warn(f"No content URLs found in sitemap at {self.start_url}")
# Build full URLs from relative paths
urls: List[str] = []
for url in relative_paths:
# URLs from _get_paths are already absolute
self._safe_add_url(urls, url, "content")
if not urls:
return
# Create a loader for content pages
content_loader = self._create_web_loader(urls)
# Use WebBaseLoader's ascrape_all for efficient parallel fetching
soup_infos = await content_loader.ascrape_all(urls)
for soup_info, url in zip(soup_infos, urls):
maybe_doc = self._get_document(soup_info, url)
if maybe_doc is not None:
yield maybe_doc
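# Async usage sketch (illustrative URL); alazy_load is an async generator,
# so it is consumed with `async for` inside a coroutine:
#
#   import asyncio
#
#   async def main() -> list:
#       loader = GitbookLoader("https://docs.example.com", load_all_paths=True)
#       return [doc async for doc in loader.alazy_load()]
#
#   docs = asyncio.run(main())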
def _get_document(
self, soup: Any, custom_url: Optional[str] = None
) -> Optional[Document]:
"""Fetch content from page and return Document."""
page_content_raw = soup.find(self.content_selector)
if not page_content_raw:
return None
content = page_content_raw.get_text(separator="\n").strip()
title_if_exists = page_content_raw.find("h1")
title = title_if_exists.text if title_if_exists else ""
metadata = {"source": custom_url or self.web_page, "title": title}
return Document(page_content=content, metadata=metadata)
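# Illustrative page shape this extraction expects with the default
# content_selector of "main"; the first <h1> inside the selected node
# becomes the document title:
#
#   <main>
#     <h1>Page title</h1>
#     <p>Body text ...</p>
#   </main>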
def _get_paths(self, soup: Any) -> List[str]:
"""Fetch all URLs in the sitemap."""
urls = []
for loc in soup.find_all("loc"):
if loc.text:
# Instead of extracting just the path, keep the full URL
# to preserve domain information
urls.append(loc.text)
return urls