WebscraperTool: bypass Cloudflare-protected sites using cloudscraper (#6337)

pull/6371/head
Weaxs 2 years ago committed by GitHub
parent 7943f7f697
commit 4e2fba404d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -10,6 +10,7 @@ import unicodedata
from contextlib import contextmanager from contextlib import contextmanager
from urllib.parse import unquote from urllib.parse import unquote
import cloudscraper
import requests import requests
from bs4 import BeautifulSoup, CData, Comment, NavigableString from bs4 import BeautifulSoup, CData, Comment, NavigableString
from newspaper import Article from newspaper import Article
@ -46,29 +47,34 @@ def get_url(url: str, user_agent: str = None) -> str:
supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"] supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10)) response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
if response.status_code != 200: if response.status_code == 200:
return "URL returned status code {}.".format(response.status_code) # check content-type
content_type = response.headers.get('Content-Type')
if content_type:
main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
else:
content_disposition = response.headers.get('Content-Disposition', '')
filename_match = re.search(r'filename="([^"]+)"', content_disposition)
if filename_match:
filename = unquote(filename_match.group(1))
extension = re.search(r'\.(\w+)$', filename)
if extension:
main_content_type = mimetypes.guess_type(filename)[0]
# check content-type if main_content_type not in supported_content_types:
content_type = response.headers.get('Content-Type') return "Unsupported content-type [{}] of URL.".format(main_content_type)
if content_type:
main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
else:
content_disposition = response.headers.get('Content-Disposition', '')
filename_match = re.search(r'filename="([^"]+)"', content_disposition)
if filename_match:
filename = unquote(filename_match.group(1))
extension = re.search(r'\.(\w+)$', filename)
if extension:
main_content_type = mimetypes.guess_type(filename)[0]
if main_content_type not in supported_content_types: if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
return "Unsupported content-type [{}] of URL.".format(main_content_type) return ExtractProcessor.load_from_url(url, return_text=True)
if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES: response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
return ExtractProcessor.load_from_url(url, return_text=True) elif response.status_code == 403:
scraper = cloudscraper.create_scraper()
response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
if response.status_code != 200:
return "URL returned status code {}.".format(response.status_code)
response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
a = extract_using_readabilipy(response.text) a = extract_using_readabilipy(response.text)
if not a['plain_text'] or not a['plain_text'].strip(): if not a['plain_text'] or not a['plain_text'].strip():

32
api/poetry.lock generated

@ -1610,6 +1610,22 @@ lz4 = ["clickhouse-cityhash (>=1.0.2.1)", "lz4", "lz4 (<=3.0.1)"]
numpy = ["numpy (>=1.12.0)", "pandas (>=0.24.0)"] numpy = ["numpy (>=1.12.0)", "pandas (>=0.24.0)"]
zstd = ["clickhouse-cityhash (>=1.0.2.1)", "zstd"] zstd = ["clickhouse-cityhash (>=1.0.2.1)", "zstd"]
[[package]]
name = "cloudscraper"
version = "1.2.71"
description = "A Python module to bypass Cloudflare's anti-bot page."
optional = false
python-versions = "*"
files = [
{file = "cloudscraper-1.2.71-py2.py3-none-any.whl", hash = "sha256:76f50ca529ed2279e220837befdec892626f9511708e200d48d5bb76ded679b0"},
{file = "cloudscraper-1.2.71.tar.gz", hash = "sha256:429c6e8aa6916d5bad5c8a5eac50f3ea53c9ac22616f6cb21b18dcc71517d0d3"},
]
[package.dependencies]
pyparsing = ">=2.4.7"
requests = ">=2.9.2"
requests-toolbelt = ">=0.9.1"
[[package]] [[package]]
name = "cohere" name = "cohere"
version = "5.2.6" version = "5.2.6"
@ -7304,6 +7320,20 @@ requests = ">=2.0.0"
[package.extras] [package.extras]
rsa = ["oauthlib[signedtoken] (>=3.0.0)"] rsa = ["oauthlib[signedtoken] (>=3.0.0)"]
[[package]]
name = "requests-toolbelt"
version = "1.0.0"
description = "A utility belt for advanced users of python-requests"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
{file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"},
{file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"},
]
[package.dependencies]
requests = ">=2.0.1,<3.0.0"
[[package]] [[package]]
name = "resend" name = "resend"
version = "0.7.2" version = "0.7.2"
@ -9408,4 +9438,4 @@ cffi = ["cffi (>=1.11)"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "8d2a12543340f6f4fa6dcb27f93d8b3f5380e7a3e7eb5e399e76e6b8588b4611" content-hash = "9b1821b6e5d6d44947cc011c2d635a366557582b4540b99e0ff53a3078a989e5"

@ -193,6 +193,7 @@ twilio = "~9.0.4"
vanna = { version = "0.5.5", extras = ["postgres", "mysql", "clickhouse", "duckdb"] } vanna = { version = "0.5.5", extras = ["postgres", "mysql", "clickhouse", "duckdb"] }
wikipedia = "1.4.0" wikipedia = "1.4.0"
yfinance = "~0.2.40" yfinance = "~0.2.40"
cloudscraper = "1.2.71"
############################################################ ############################################################
# VDB dependencies required by vector store clients # VDB dependencies required by vector store clients

Loading…
Cancel
Save