diff --git a/api/core/model_runtime/model_providers/tongyi/tts/tts.py b/api/core/model_runtime/model_providers/tongyi/tts/tts.py deleted file mode 100644 index 2e07d93d66..0000000000 --- a/api/core/model_runtime/model_providers/tongyi/tts/tts.py +++ /dev/null @@ -1,155 +0,0 @@ -import threading -from queue import Queue -from typing import Any, Optional -from venv import logger - -import dashscope # type: ignore -from dashscope import SpeechSynthesizer # type: ignore -from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse # type: ignore -from dashscope.audio.tts import ResultCallback, SpeechSynthesisResult # type: ignore - -from core.model_runtime.errors.invoke import InvokeBadRequestError -from core.model_runtime.errors.validate import CredentialsValidateFailedError -from core.model_runtime.model_providers.__base.tts_model import TTSModel -from core.model_runtime.model_providers.tongyi._common import _CommonTongyi - - -class TongyiText2SpeechModel(_CommonTongyi, TTSModel): - """ - Model class for Tongyi Speech to text model. - """ - - def _invoke( - self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, user: Optional[str] = None - ) -> Any: - """ - _invoke text2speech model - - :param model: model name - :param tenant_id: user tenant id - :param credentials: model credentials - :param voice: model timbre - :param content_text: text content to be translated - :param user: unique user id - :return: text translated to audio file - """ - if not voice or voice not in [ - d["value"] for d in self.get_tts_model_voices(model=model, credentials=credentials) - ]: - voice = self._get_model_default_voice(model, credentials) - - return self._tts_invoke_streaming(model=model, credentials=credentials, content_text=content_text, voice=voice) - - def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None: - """ - validate credentials text2speech model - - :param model: model name - :param credentials: model credentials - :param user: unique user id - :return: text translated to audio file - """ - try: - self._tts_invoke_streaming( - model=model, - credentials=credentials, - content_text="Hello Dify!", - voice=self._get_model_default_voice(model, credentials), - ) - except Exception as ex: - raise CredentialsValidateFailedError(str(ex)) - - def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, voice: str) -> Any: - """ - _tts_invoke_streaming text2speech model - - :param model: model name - :param credentials: model credentials - :param voice: model timbre - :param content_text: text content to be translated - :return: text translated to audio file - """ - word_limit = self._get_model_word_limit(model, credentials) - audio_type = self._get_model_audio_type(model, credentials) - logger.info(f"TONGYI: summary content length is less than word limit, content: {content_text}, voice: {voice}") - try: - audio_queue: Queue = Queue() - callback = Callback(queue=audio_queue) - - def invoke_remote(content, v, api_key, cb, at, wl): - if len(content) < word_limit: - logger.info(f"TONGYI: content length is less than word limit, content: {content}, voice: {v}") - sentences = [content] - else: - sentences = list(self._split_text_into_sentences(org_text=content, max_length=wl)) - for sentence in sentences: - SpeechSynthesizer.call( - model=v, - sample_rate=16000, - api_key=api_key, - text=sentence.strip(), - callback=cb, - format=at, - word_timestamp_enabled=True, - phoneme_timestamp_enabled=True, - ) - - threading.Thread( - target=invoke_remote, - args=(content_text, voice, credentials.get("dashscope_api_key"), callback, audio_type, word_limit), - ).start() - - while True: - audio = audio_queue.get() - if audio is None: - break - yield audio - - except Exception as ex: - raise InvokeBadRequestError(str(ex)) - - @staticmethod - def _process_sentence(sentence: str, credentials: dict, voice: str, audio_type: str): - """ - _tts_invoke Tongyi text2speech model api - - :param credentials: model credentials - :param sentence: text content to be translated - :param voice: model timbre - :param audio_type: audio file type - :return: text translated to audio file - """ - response = dashscope.audio.tts.SpeechSynthesizer.call( - model=voice, - sample_rate=48000, - api_key=credentials.get("dashscope_api_key"), - text=sentence.strip(), - format=audio_type, - ) - if isinstance(response.get_audio_data(), bytes): - return response.get_audio_data() - - -class Callback(ResultCallback): - def __init__(self, queue: Queue): - self._queue = queue - - def on_open(self): - pass - - def on_complete(self): - self._queue.put(None) - self._queue.task_done() - - def on_error(self, response: SpeechSynthesisResponse): - self._queue.put(None) - self._queue.task_done() - - def on_close(self): - self._queue.put(None) - self._queue.task_done() - - def on_event(self, result: SpeechSynthesisResult): - ad = result.get_audio_frame() - if ad: - self._queue.put(ad)