diff --git a/api/core/tools/utils/web_reader_tool.py b/api/core/tools/utils/web_reader_tool.py index d42fd99fce..d0a4aea5ab 100644 --- a/api/core/tools/utils/web_reader_tool.py +++ b/api/core/tools/utils/web_reader_tool.py @@ -1,20 +1,18 @@ import hashlib -import json import mimetypes import os import re import site -import subprocess -import tempfile import unicodedata from contextlib import contextmanager -from pathlib import Path +from dataclasses import dataclass from typing import Any, Literal, Optional, cast from urllib.parse import unquote import chardet import cloudscraper # type: ignore from bs4 import BeautifulSoup, CData, Comment, NavigableString # type: ignore +from readability import Document # type: ignore from regex import regex # type: ignore from core.helper import ssrf_proxy @@ -23,9 +21,7 @@ from core.rag.extractor.extract_processor import ExtractProcessor FULL_TEMPLATE = """ TITLE: {title} -AUTHORS: {authors} -PUBLISH DATE: {publish_date} -TOP_IMAGE_URL: {top_image} +AUTHOR: {author} TEXT: {text} @@ -90,66 +86,40 @@ def get_url(url: str, user_agent: Optional[str] = None) -> str: else: content = response.text - a = extract_using_readabilipy(content) + article = extract_using_readabilipy(content) - if not a["plain_text"] or not a["plain_text"].strip(): + if not article.text.strip(): return "" res = FULL_TEMPLATE.format( - title=a["title"], - authors=a["byline"], - publish_date=a["date"], - top_image="", - text=a["plain_text"] or "", + title=article.title, + author=article.auther, + text=article.text, ) return res -def extract_using_readabilipy(html): - with tempfile.NamedTemporaryFile(delete=False, mode="w+") as f_html: - f_html.write(html) - f_html.close() - html_path = f_html.name - - # Call Mozilla's Readability.js Readability.parse() function via node, writing output to a temporary file - article_json_path = html_path + ".json" - jsdir = os.path.join(find_module_path("readabilipy"), "javascript") - with chdir(jsdir): - subprocess.check_call(["node", "ExtractArticle.js", "-i", html_path, "-o", article_json_path]) - - # Read output of call to Readability.parse() from JSON file and return as Python dictionary - input_json = json.loads(Path(article_json_path).read_text(encoding="utf-8")) - - # Deleting files after processing - os.unlink(article_json_path) - os.unlink(html_path) - - article_json: dict[str, Any] = { - "title": None, - "byline": None, - "date": None, - "content": None, - "plain_content": None, - "plain_text": None, - } - # Populate article fields from readability fields where present - if input_json: - if input_json.get("title"): - article_json["title"] = input_json["title"] - if input_json.get("byline"): - article_json["byline"] = input_json["byline"] - if input_json.get("date"): - article_json["date"] = input_json["date"] - if input_json.get("content"): - article_json["content"] = input_json["content"] - article_json["plain_content"] = plain_content(article_json["content"], False, False) - article_json["plain_text"] = extract_text_blocks_as_plain_text(article_json["plain_content"]) - if input_json.get("textContent"): - article_json["plain_text"] = input_json["textContent"] - article_json["plain_text"] = re.sub(r"\n\s*\n", "\n", article_json["plain_text"]) - - return article_json +@dataclass +class Article: + title: str + auther: str + text: str + + +def extract_using_readabilipy(html: str): + doc = Document(html) + article = Article( + title=doc.title(), + auther=doc.author(), + text=plain_content( + readability_content=doc.content(), + content_digests=False, + node_indexes=False, + ), + ) + + return article def find_module_path(module_name): diff --git a/api/pyproject.toml b/api/pyproject.toml index 1c6adb6587..7f0c2e98b5 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -83,6 +83,7 @@ dependencies = [ "weave~=0.51.0", "yarl~=1.18.3", "webvtt-py~=0.5.1", + "readability-lxml~=0.8.4.1", ] # Before adding new dependency, consider place it in # alphabet order (a-z) and suitable group. diff --git a/api/uv.lock b/api/uv.lock index 033dc8762b..d723bd5936 100644 --- a/api/uv.lock +++ b/api/uv.lock @@ -1151,6 +1151,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/87/62/d69eb4a8ee231f4bf733a92caf9da13f1c81a44e874b1d4080c25ecbb723/cryptography-44.0.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:5d20cc348cca3a8aa7312f42ab953a56e15323800ca3ab0706b8cd452a3a056c", size = 3134369 }, ] +[[package]] +name = "cssselect" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/0a/c3ea9573b1dc2e151abfe88c7fe0c26d1892fe6ed02d0cdb30f0d57029d5/cssselect-1.3.0.tar.gz", hash = "sha256:57f8a99424cfab289a1b6a816a43075a4b00948c86b4dcf3ef4ee7e15f7ab0c7", size = 42870 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ee/58/257350f7db99b4ae12b614a36256d9cc870d71d9e451e79c2dc3b23d7c3c/cssselect-1.3.0-py3-none-any.whl", hash = "sha256:56d1bf3e198080cc1667e137bc51de9cadfca259f03c2d4e09037b3e01e30f0d", size = 18786 }, +] + [[package]] name = "dataclasses-json" version = "0.6.7" @@ -1276,6 +1285,7 @@ dependencies = [ { name = "python-dotenv" }, { name = "pyyaml" }, { name = "readabilipy" }, + { name = "readability-lxml" }, { name = "redis", extra = ["hiredis"] }, { name = "resend" }, { name = "sentry-sdk", extra = ["flask"] }, @@ -1447,6 +1457,7 @@ requires-dist = [ { name = "python-dotenv", specifier = "==1.0.1" }, { name = "pyyaml", specifier = "~=6.0.1" }, { name = "readabilipy", specifier = "~=0.3.0" }, + { name = "readability-lxml", specifier = "~=0.8.4.1" }, { name = "redis", extras = ["hiredis"], specifier = "~=6.1.0" }, { name = "resend", specifier = "~=2.9.0" }, { name = "sentry-sdk", extras = ["flask"], specifier = "~=2.28.0" }, @@ -2931,6 +2942,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/91/1e/05ddcb57ad2f3069101611bd5f5084157d90861a2ef460bf42f45cced944/lxml-5.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1dc4ca99e89c335a7ed47d38964abcb36c5910790f9bd106f2a8fa2ee0b909d2", size = 3817095 }, ] +[package.optional-dependencies] +html-clean = [ + { name = "lxml-html-clean" }, +] + +[[package]] +name = "lxml-html-clean" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "lxml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/79/b6/466e71db127950fb8d172026a8f0a9f0dc6f64c8e78e2ca79f252e5790b8/lxml_html_clean-0.4.2.tar.gz", hash = "sha256:91291e7b5db95430abf461bc53440964d58e06cc468950f9e47db64976cebcb3", size = 21622 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/0b/942cb7278d6caad79343ad2ddd636ed204a47909b969d19114a3097f5aa3/lxml_html_clean-0.4.2-py3-none-any.whl", hash = "sha256:74ccfba277adcfea87a1e9294f47dd86b05d65b4da7c5b07966e3d5f3be8a505", size = 14184 }, +] + [[package]] name = "lxml-stubs" version = "0.5.1" @@ -4838,6 +4866,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dd/46/8a640c6de1a6c6af971f858b2fb178ca5e1db91f223d8ba5f40efe1491e5/readabilipy-0.3.0-py3-none-any.whl", hash = "sha256:d106da0fad11d5fdfcde21f5c5385556bfa8ff0258483037d39ea6b1d6db3943", size = 22158 }, ] +[[package]] +name = "readability-lxml" +version = "0.8.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "chardet" }, + { name = "cssselect" }, + { name = "lxml", extra = ["html-clean"] }, +] +sdist = { url = "https://files.pythonhosted.org/packages/55/3e/dc87d97532ddad58af786ec89c7036182e352574c1cba37bf2bf783d2b15/readability_lxml-0.8.4.1.tar.gz", hash = "sha256:9d2924f5942dd7f37fb4da353263b22a3e877ccf922d0e45e348e4177b035a53", size = 22874 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/75/2cc58965097e351415af420be81c4665cf80da52a17ef43c01ffbe2caf91/readability_lxml-0.8.4.1-py3-none-any.whl", hash = "sha256:874c0cea22c3bf2b78c7f8df831bfaad3c0a89b7301d45a188db581652b4b465", size = 19912 }, +] + [[package]] name = "realtime" version = "2.4.3"