feat(website-crawl): add jina reader as additional alternative for website crawling (#8761)
parent
fb49413a41
commit
369e1e6f58
@ -0,0 +1,35 @@
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.models.document import Document
|
||||
from services.website_service import WebsiteService
|
||||
|
||||
|
||||
class JinaReaderWebExtractor(BaseExtractor):
|
||||
"""
|
||||
Crawl and scrape websites and return content in clean llm-ready markdown.
|
||||
"""
|
||||
|
||||
def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = False):
|
||||
"""Initialize with url, api_key, base_url and mode."""
|
||||
self._url = url
|
||||
self.job_id = job_id
|
||||
self.tenant_id = tenant_id
|
||||
self.mode = mode
|
||||
self.only_main_content = only_main_content
|
||||
|
||||
def extract(self) -> list[Document]:
|
||||
"""Extract content from the URL."""
|
||||
documents = []
|
||||
if self.mode == "crawl":
|
||||
crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "jinareader", self._url, self.tenant_id)
|
||||
if crawl_data is None:
|
||||
return []
|
||||
document = Document(
|
||||
page_content=crawl_data.get("content", ""),
|
||||
metadata={
|
||||
"source_url": crawl_data.get("url"),
|
||||
"description": crawl_data.get("description"),
|
||||
"title": crawl_data.get("title"),
|
||||
},
|
||||
)
|
||||
documents.append(document)
|
||||
return documents
|
||||
@ -0,0 +1,44 @@
|
||||
import json
|
||||
|
||||
import requests
|
||||
|
||||
from services.auth.api_key_auth_base import ApiKeyAuthBase
|
||||
|
||||
|
||||
class JinaAuth(ApiKeyAuthBase):
|
||||
def __init__(self, credentials: dict):
|
||||
super().__init__(credentials)
|
||||
auth_type = credentials.get("auth_type")
|
||||
if auth_type != "bearer":
|
||||
raise ValueError("Invalid auth type, Jina Reader auth type must be Bearer")
|
||||
self.api_key = credentials.get("config").get("api_key", None)
|
||||
|
||||
if not self.api_key:
|
||||
raise ValueError("No API key provided")
|
||||
|
||||
def validate_credentials(self):
|
||||
headers = self._prepare_headers()
|
||||
options = {
|
||||
"url": "https://example.com",
|
||||
}
|
||||
response = self._post_request("https://r.jina.ai", options, headers)
|
||||
if response.status_code == 200:
|
||||
return True
|
||||
else:
|
||||
self._handle_error(response)
|
||||
|
||||
def _prepare_headers(self):
|
||||
return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
|
||||
|
||||
def _post_request(self, url, data, headers):
|
||||
return requests.post(url, headers=headers, json=data)
|
||||
|
||||
def _handle_error(self, response):
|
||||
if response.status_code in {402, 409, 500}:
|
||||
error_message = response.json().get("error", "Unknown error occurred")
|
||||
raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
|
||||
else:
|
||||
if response.text:
|
||||
error_message = json.loads(response.text).get("error", "Unknown error occurred")
|
||||
raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
|
||||
raise Exception(f"Unexpected error occurred while trying to authorize. Status code: {response.status_code}")
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 2.7 KiB |
@ -0,0 +1,6 @@
|
||||
.jinaLogo {
|
||||
@apply w-4 h-4 bg-center bg-no-repeat inline-block;
|
||||
background-color: #F5FAFF;
|
||||
background-image: url(../assets/jina.png);
|
||||
background-size: 16px;
|
||||
}
|
||||
@ -0,0 +1,42 @@
|
||||
'use client'
|
||||
import type { FC } from 'react'
|
||||
import React from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import { Settings01 } from '@/app/components/base/icons/src/vender/line/general'
|
||||
import { BookOpen01 } from '@/app/components/base/icons/src/vender/line/education'
|
||||
|
||||
const I18N_PREFIX = 'datasetCreation.stepOne.website'
|
||||
|
||||
type Props = {
|
||||
onSetting: () => void
|
||||
}
|
||||
|
||||
const Header: FC<Props> = ({
|
||||
onSetting,
|
||||
}) => {
|
||||
const { t } = useTranslation()
|
||||
|
||||
return (
|
||||
<div className='flex h-6 items-center justify-between'>
|
||||
<div className='flex items-center'>
|
||||
<div className='text-base font-medium text-gray-700'>{t(`${I18N_PREFIX}.jinaReaderTitle`)}</div>
|
||||
<div className='ml-2 mr-1 w-px h-3.5 bg-gray-200'></div>
|
||||
<div
|
||||
className='p-1 rounded-md hover:bg-black/5 cursor-pointer'
|
||||
onClick={onSetting}
|
||||
>
|
||||
<Settings01 className='w-3.5 h-3.5 text-gray-500' />
|
||||
</div>
|
||||
</div>
|
||||
<a
|
||||
href='https://jina.ai/reader'
|
||||
target='_blank' rel='noopener noreferrer'
|
||||
className='flex items-center text-xs text-primary-600'
|
||||
>
|
||||
<BookOpen01 className='mr-1 w-3.5 h-3.5 text-primary-600' />
|
||||
{t(`${I18N_PREFIX}.jinaReaderDoc`)}
|
||||
</a>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
export default React.memo(Header)
|
||||
@ -0,0 +1,232 @@
|
||||
'use client'
|
||||
import type { FC } from 'react'
|
||||
import React, { useCallback, useEffect, useState } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import UrlInput from '../base/url-input'
|
||||
import OptionsWrap from '../base/options-wrap'
|
||||
import CrawledResult from '../base/crawled-result'
|
||||
import Crawling from '../base/crawling'
|
||||
import ErrorMessage from '../base/error-message'
|
||||
import Header from './header'
|
||||
import Options from './options'
|
||||
import cn from '@/utils/classnames'
|
||||
import { useModalContext } from '@/context/modal-context'
|
||||
import Toast from '@/app/components/base/toast'
|
||||
import { checkJinaReaderTaskStatus, createJinaReaderTask } from '@/service/datasets'
|
||||
import { sleep } from '@/utils'
|
||||
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
|
||||
|
||||
const ERROR_I18N_PREFIX = 'common.errorMsg'
|
||||
const I18N_PREFIX = 'datasetCreation.stepOne.website'
|
||||
|
||||
type Props = {
|
||||
onPreview: (payload: CrawlResultItem) => void
|
||||
checkedCrawlResult: CrawlResultItem[]
|
||||
onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
|
||||
onJobIdChange: (jobId: string) => void
|
||||
crawlOptions: CrawlOptions
|
||||
onCrawlOptionsChange: (payload: CrawlOptions) => void
|
||||
}
|
||||
|
||||
enum Step {
|
||||
init = 'init',
|
||||
running = 'running',
|
||||
finished = 'finished',
|
||||
}
|
||||
|
||||
const JinaReader: FC<Props> = ({
|
||||
onPreview,
|
||||
checkedCrawlResult,
|
||||
onCheckedCrawlResultChange,
|
||||
onJobIdChange,
|
||||
crawlOptions,
|
||||
onCrawlOptionsChange,
|
||||
}) => {
|
||||
const { t } = useTranslation()
|
||||
const [step, setStep] = useState<Step>(Step.init)
|
||||
const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)
|
||||
useEffect(() => {
|
||||
if (step !== Step.init)
|
||||
setControlFoldOptions(Date.now())
|
||||
}, [step])
|
||||
const { setShowAccountSettingModal } = useModalContext()
|
||||
const handleSetting = useCallback(() => {
|
||||
setShowAccountSettingModal({
|
||||
payload: 'data-source',
|
||||
})
|
||||
}, [setShowAccountSettingModal])
|
||||
|
||||
const checkValid = useCallback((url: string) => {
|
||||
let errorMsg = ''
|
||||
if (!url) {
|
||||
errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
|
||||
field: 'url',
|
||||
})
|
||||
}
|
||||
|
||||
if (!errorMsg && !((url.startsWith('http://') || url.startsWith('https://'))))
|
||||
errorMsg = t(`${ERROR_I18N_PREFIX}.urlError`)
|
||||
|
||||
if (!errorMsg && (crawlOptions.limit === null || crawlOptions.limit === undefined || crawlOptions.limit === '')) {
|
||||
errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
|
||||
field: t(`${I18N_PREFIX}.limit`),
|
||||
})
|
||||
}
|
||||
|
||||
return {
|
||||
isValid: !errorMsg,
|
||||
errorMsg,
|
||||
}
|
||||
}, [crawlOptions, t])
|
||||
|
||||
const isInit = step === Step.init
|
||||
const isCrawlFinished = step === Step.finished
|
||||
const isRunning = step === Step.running
|
||||
const [crawlResult, setCrawlResult] = useState<{
|
||||
current: number
|
||||
total: number
|
||||
data: CrawlResultItem[]
|
||||
time_consuming: number | string
|
||||
} | undefined>(undefined)
|
||||
const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
|
||||
const showError = isCrawlFinished && crawlErrorMessage
|
||||
|
||||
const waitForCrawlFinished = useCallback(async (jobId: string) => {
|
||||
try {
|
||||
const res = await checkJinaReaderTaskStatus(jobId) as any
|
||||
console.log('res', res)
|
||||
if (res.status === 'completed') {
|
||||
return {
|
||||
isError: false,
|
||||
data: {
|
||||
...res,
|
||||
total: Math.min(res.total, parseFloat(crawlOptions.limit as string)),
|
||||
},
|
||||
}
|
||||
}
|
||||
if (res.status === 'failed' || !res.status) {
|
||||
return {
|
||||
isError: true,
|
||||
errorMessage: res.message,
|
||||
data: {
|
||||
data: [],
|
||||
},
|
||||
}
|
||||
}
|
||||
// update the progress
|
||||
setCrawlResult({
|
||||
...res,
|
||||
total: Math.min(res.total, parseFloat(crawlOptions.limit as string)),
|
||||
})
|
||||
onCheckedCrawlResultChange(res.data || []) // default select the crawl result
|
||||
await sleep(2500)
|
||||
return await waitForCrawlFinished(jobId)
|
||||
}
|
||||
catch (e: any) {
|
||||
const errorBody = await e.json()
|
||||
return {
|
||||
isError: true,
|
||||
errorMessage: errorBody.message,
|
||||
data: {
|
||||
data: [],
|
||||
},
|
||||
}
|
||||
}
|
||||
}, [crawlOptions.limit])
|
||||
|
||||
const handleRun = useCallback(async (url: string) => {
|
||||
const { isValid, errorMsg } = checkValid(url)
|
||||
if (!isValid) {
|
||||
Toast.notify({
|
||||
message: errorMsg!,
|
||||
type: 'error',
|
||||
})
|
||||
return
|
||||
}
|
||||
setStep(Step.running)
|
||||
try {
|
||||
const startTime = Date.now()
|
||||
const res = await createJinaReaderTask({
|
||||
url,
|
||||
options: crawlOptions,
|
||||
}) as any
|
||||
|
||||
if (res.data) {
|
||||
const data = {
|
||||
current: 1,
|
||||
total: 1,
|
||||
data: [{
|
||||
title: res.data.title,
|
||||
markdown: res.data.content,
|
||||
description: res.data.description,
|
||||
source_url: res.data.url,
|
||||
}],
|
||||
time_consuming: (Date.now() - startTime) / 1000,
|
||||
}
|
||||
setCrawlResult(data)
|
||||
onCheckedCrawlResultChange(data.data || [])
|
||||
setCrawlErrorMessage('')
|
||||
}
|
||||
else if (res.job_id) {
|
||||
const jobId = res.job_id
|
||||
onJobIdChange(jobId)
|
||||
const { isError, data, errorMessage } = await waitForCrawlFinished(jobId)
|
||||
if (isError) {
|
||||
setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`))
|
||||
}
|
||||
else {
|
||||
setCrawlResult(data)
|
||||
onCheckedCrawlResultChange(data.data || []) // default select the crawl result
|
||||
setCrawlErrorMessage('')
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (e) {
|
||||
setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
|
||||
console.log(e)
|
||||
}
|
||||
finally {
|
||||
setStep(Step.finished)
|
||||
}
|
||||
}, [checkValid, crawlOptions, onJobIdChange, t, waitForCrawlFinished])
|
||||
|
||||
return (
|
||||
<div>
|
||||
<Header onSetting={handleSetting} />
|
||||
<div className={cn('mt-2 p-4 pb-0 rounded-xl border border-gray-200')}>
|
||||
<UrlInput onRun={handleRun} isRunning={isRunning} />
|
||||
<OptionsWrap
|
||||
className={cn('mt-4')}
|
||||
controlFoldOptions={controlFoldOptions}
|
||||
>
|
||||
<Options className='mt-2' payload={crawlOptions} onChange={onCrawlOptionsChange} />
|
||||
</OptionsWrap>
|
||||
|
||||
{!isInit && (
|
||||
<div className='mt-3 relative left-[-16px] w-[calc(100%_+_32px)] rounded-b-xl'>
|
||||
{isRunning
|
||||
&& <Crawling
|
||||
className='mt-2'
|
||||
crawledNum={crawlResult?.current || 0}
|
||||
totalNum={crawlResult?.total || parseFloat(crawlOptions.limit as string) || 0}
|
||||
/>}
|
||||
{showError && (
|
||||
<ErrorMessage className='rounded-b-xl' title={t(`${I18N_PREFIX}.exceptionErrorTitle`)} errorMsg={crawlErrorMessage} />
|
||||
)}
|
||||
{isCrawlFinished && !showError
|
||||
&& <CrawledResult
|
||||
className='mb-2'
|
||||
list={crawlResult?.data || []}
|
||||
checkedList={checkedCrawlResult}
|
||||
onSelectedChange={onCheckedCrawlResultChange}
|
||||
onPreview={onPreview}
|
||||
usedTime={parseFloat(crawlResult?.time_consuming as string) || 0}
|
||||
/>
|
||||
}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
export default React.memo(JinaReader)
|
||||
@ -0,0 +1,59 @@
|
||||
'use client'
|
||||
import type { FC } from 'react'
|
||||
import React, { useCallback } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import CheckboxWithLabel from '../base/checkbox-with-label'
|
||||
import Field from '../base/field'
|
||||
import cn from '@/utils/classnames'
|
||||
import type { CrawlOptions } from '@/models/datasets'
|
||||
|
||||
const I18N_PREFIX = 'datasetCreation.stepOne.website'
|
||||
|
||||
type Props = {
|
||||
className?: string
|
||||
payload: CrawlOptions
|
||||
onChange: (payload: CrawlOptions) => void
|
||||
}
|
||||
|
||||
const Options: FC<Props> = ({
|
||||
className = '',
|
||||
payload,
|
||||
onChange,
|
||||
}) => {
|
||||
const { t } = useTranslation()
|
||||
|
||||
const handleChange = useCallback((key: keyof CrawlOptions) => {
|
||||
return (value: any) => {
|
||||
onChange({
|
||||
...payload,
|
||||
[key]: value,
|
||||
})
|
||||
}
|
||||
}, [payload, onChange])
|
||||
return (
|
||||
<div className={cn(className, ' space-y-2')}>
|
||||
<CheckboxWithLabel
|
||||
label={t(`${I18N_PREFIX}.crawlSubPage`)}
|
||||
isChecked={payload.crawl_sub_pages}
|
||||
onChange={handleChange('crawl_sub_pages')}
|
||||
/>
|
||||
<CheckboxWithLabel
|
||||
label={t(`${I18N_PREFIX}.useSitemap`)}
|
||||
isChecked={payload.use_sitemap}
|
||||
onChange={handleChange('use_sitemap')}
|
||||
tooltip={t(`${I18N_PREFIX}.useSitemapTooltip`) as string}
|
||||
/>
|
||||
<div className='flex justify-between space-x-4'>
|
||||
<Field
|
||||
className='grow shrink-0'
|
||||
label={t(`${I18N_PREFIX}.limit`)}
|
||||
value={payload.limit}
|
||||
onChange={handleChange('limit')}
|
||||
isNumber
|
||||
isRequired
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
export default React.memo(Options)
|
||||
@ -0,0 +1,140 @@
|
||||
'use client'
|
||||
import type { FC } from 'react'
|
||||
import React, { useCallback, useState } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import {
|
||||
PortalToFollowElem,
|
||||
PortalToFollowElemContent,
|
||||
} from '@/app/components/base/portal-to-follow-elem'
|
||||
import { Lock01 } from '@/app/components/base/icons/src/vender/solid/security'
|
||||
import Button from '@/app/components/base/button'
|
||||
import { DataSourceProvider } from '@/models/common'
|
||||
import Field from '@/app/components/datasets/create/website/base/field'
|
||||
import Toast from '@/app/components/base/toast'
|
||||
import { createDataSourceApiKeyBinding } from '@/service/datasets'
|
||||
import { LinkExternal02 } from '@/app/components/base/icons/src/vender/line/general'
|
||||
type Props = {
|
||||
onCancel: () => void
|
||||
onSaved: () => void
|
||||
}
|
||||
|
||||
const I18N_PREFIX = 'datasetCreation.jinaReader'
|
||||
|
||||
const ConfigJinaReaderModal: FC<Props> = ({
|
||||
onCancel,
|
||||
onSaved,
|
||||
}) => {
|
||||
const { t } = useTranslation()
|
||||
const [isSaving, setIsSaving] = useState(false)
|
||||
const [apiKey, setApiKey] = useState('')
|
||||
|
||||
const handleSave = useCallback(async () => {
|
||||
if (isSaving)
|
||||
return
|
||||
let errorMsg = ''
|
||||
if (!errorMsg) {
|
||||
if (!apiKey) {
|
||||
errorMsg = t('common.errorMsg.fieldRequired', {
|
||||
field: 'API Key',
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
if (errorMsg) {
|
||||
Toast.notify({
|
||||
type: 'error',
|
||||
message: errorMsg,
|
||||
})
|
||||
return
|
||||
}
|
||||
const postData = {
|
||||
category: 'website',
|
||||
provider: DataSourceProvider.jinaReader,
|
||||
credentials: {
|
||||
auth_type: 'bearer',
|
||||
config: {
|
||||
api_key: apiKey,
|
||||
},
|
||||
},
|
||||
}
|
||||
try {
|
||||
setIsSaving(true)
|
||||
await createDataSourceApiKeyBinding(postData)
|
||||
Toast.notify({
|
||||
type: 'success',
|
||||
message: t('common.api.success'),
|
||||
})
|
||||
}
|
||||
finally {
|
||||
setIsSaving(false)
|
||||
}
|
||||
|
||||
onSaved()
|
||||
}, [apiKey, onSaved, t, isSaving])
|
||||
|
||||
return (
|
||||
<PortalToFollowElem open>
|
||||
<PortalToFollowElemContent className='w-full h-full z-[60]'>
|
||||
<div className='fixed inset-0 flex items-center justify-center bg-black/[.25]'>
|
||||
<div className='mx-2 w-[640px] max-h-[calc(100vh-120px)] bg-white shadow-xl rounded-2xl overflow-y-auto'>
|
||||
<div className='px-8 pt-8'>
|
||||
<div className='flex justify-between items-center mb-4'>
|
||||
<div className='text-xl font-semibold text-gray-900'>{t(`${I18N_PREFIX}.configJinaReader`)}</div>
|
||||
</div>
|
||||
|
||||
<div className='space-y-4'>
|
||||
<Field
|
||||
label='API Key'
|
||||
labelClassName='!text-sm'
|
||||
isRequired
|
||||
value={apiKey}
|
||||
onChange={(value: string | number) => setApiKey(value as string)}
|
||||
placeholder={t(`${I18N_PREFIX}.apiKeyPlaceholder`)!}
|
||||
/>
|
||||
</div>
|
||||
<div className='my-8 flex justify-between items-center h-8'>
|
||||
<a className='flex items-center space-x-1 leading-[18px] text-xs font-normal text-[#155EEF]' target='_blank' href='https://jina.ai/reader/'>
|
||||
<span>{t(`${I18N_PREFIX}.getApiKeyLinkText`)}</span>
|
||||
<LinkExternal02 className='w-3 h-3' />
|
||||
</a>
|
||||
<div className='flex'>
|
||||
<Button
|
||||
size='large'
|
||||
className='mr-2'
|
||||
onClick={onCancel}
|
||||
>
|
||||
{t('common.operation.cancel')}
|
||||
</Button>
|
||||
<Button
|
||||
variant='primary'
|
||||
size='large'
|
||||
onClick={handleSave}
|
||||
loading={isSaving}
|
||||
>
|
||||
{t('common.operation.save')}
|
||||
</Button>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<div className='border-t-[0.5px] border-t-black/5'>
|
||||
<div className='flex justify-center items-center py-3 bg-gray-50 text-xs text-gray-500'>
|
||||
<Lock01 className='mr-1 w-3 h-3 text-gray-500' />
|
||||
{t('common.modelProvider.encrypted.front')}
|
||||
<a
|
||||
className='text-primary-600 mx-1'
|
||||
target='_blank' rel='noopener noreferrer'
|
||||
href='https://pycryptodome.readthedocs.io/en/latest/src/cipher/oaep.html'
|
||||
>
|
||||
PKCS1_OAEP
|
||||
</a>
|
||||
{t('common.modelProvider.encrypted.back')}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</PortalToFollowElemContent>
|
||||
</PortalToFollowElem>
|
||||
)
|
||||
}
|
||||
export default React.memo(ConfigJinaReaderModal)
|
||||
Loading…
Reference in New Issue