fix wrong charset when decoding Chinese content (#6774)

Co-authored-by: zhangwb <zhangwb@zts.com.cn>
pull/6810/head
eric-0x72 2 years ago committed by GitHub
parent 53a89bbbc7
commit 98d9837fbc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -10,6 +10,7 @@ import unicodedata
from contextlib import contextmanager from contextlib import contextmanager
from urllib.parse import unquote from urllib.parse import unquote
import chardet
import cloudscraper import cloudscraper
from bs4 import BeautifulSoup, CData, Comment, NavigableString from bs4 import BeautifulSoup, CData, Comment, NavigableString
from regex import regex from regex import regex
@ -75,7 +76,18 @@ def get_url(url: str, user_agent: str = None) -> str:
if response.status_code != 200: if response.status_code != 200:
return "URL returned status code {}.".format(response.status_code) return "URL returned status code {}.".format(response.status_code)
a = extract_using_readabilipy(response.text) # Detect encoding using chardet
detected_encoding = chardet.detect(response.content)
encoding = detected_encoding['encoding']
if encoding:
try:
content = response.content.decode(encoding)
except (UnicodeDecodeError, TypeError):
content = response.text
else:
content = response.text
a = extract_using_readabilipy(content)
if not a['plain_text'] or not a['plain_text'].strip(): if not a['plain_text'] or not a['plain_text'].strip():
return '' return ''

Loading…
Cancel
Save