|
|
|
|
@ -116,7 +116,7 @@ class ReplicateLargeLanguageModel(_CommonReplicate, LargeLanguageModel):
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
for key, value in input_properties:
|
|
|
|
|
if key not in ['system_prompt', 'prompt']:
|
|
|
|
|
if key not in ['system_prompt', 'prompt'] and 'stop' not in key:
|
|
|
|
|
value_type = value.get('type')
|
|
|
|
|
|
|
|
|
|
if not value_type:
|
|
|
|
|
@ -151,9 +151,17 @@ class ReplicateLargeLanguageModel(_CommonReplicate, LargeLanguageModel):
|
|
|
|
|
index = -1
|
|
|
|
|
current_completion: str = ""
|
|
|
|
|
stop_condition_reached = False
|
|
|
|
|
|
|
|
|
|
prediction_output_length = 10000
|
|
|
|
|
is_prediction_output_finished = False
|
|
|
|
|
|
|
|
|
|
for output in prediction.output_iterator():
|
|
|
|
|
current_completion += output
|
|
|
|
|
|
|
|
|
|
if not is_prediction_output_finished and prediction.status == 'succeeded':
|
|
|
|
|
prediction_output_length = len(prediction.output) - 1
|
|
|
|
|
is_prediction_output_finished = True
|
|
|
|
|
|
|
|
|
|
if stop:
|
|
|
|
|
for s in stop:
|
|
|
|
|
if s in current_completion:
|
|
|
|
|
@ -172,20 +180,30 @@ class ReplicateLargeLanguageModel(_CommonReplicate, LargeLanguageModel):
|
|
|
|
|
content=output if output else ''
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
prompt_tokens = self.get_num_tokens(model, credentials, prompt_messages)
|
|
|
|
|
completion_tokens = self.get_num_tokens(model, credentials, [assistant_prompt_message])
|
|
|
|
|
|
|
|
|
|
usage = self._calc_response_usage(model, credentials, prompt_tokens, completion_tokens)
|
|
|
|
|
|
|
|
|
|
yield LLMResultChunk(
|
|
|
|
|
model=model,
|
|
|
|
|
prompt_messages=prompt_messages,
|
|
|
|
|
delta=LLMResultChunkDelta(
|
|
|
|
|
index=index,
|
|
|
|
|
message=assistant_prompt_message,
|
|
|
|
|
usage=usage,
|
|
|
|
|
),
|
|
|
|
|
)
|
|
|
|
|
if index < prediction_output_length:
|
|
|
|
|
yield LLMResultChunk(
|
|
|
|
|
model=model,
|
|
|
|
|
prompt_messages=prompt_messages,
|
|
|
|
|
delta=LLMResultChunkDelta(
|
|
|
|
|
index=index,
|
|
|
|
|
message=assistant_prompt_message
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
else:
|
|
|
|
|
prompt_tokens = self.get_num_tokens(model, credentials, prompt_messages)
|
|
|
|
|
completion_tokens = self.get_num_tokens(model, credentials, [assistant_prompt_message])
|
|
|
|
|
|
|
|
|
|
usage = self._calc_response_usage(model, credentials, prompt_tokens, completion_tokens)
|
|
|
|
|
|
|
|
|
|
yield LLMResultChunk(
|
|
|
|
|
model=model,
|
|
|
|
|
prompt_messages=prompt_messages,
|
|
|
|
|
delta=LLMResultChunkDelta(
|
|
|
|
|
index=index,
|
|
|
|
|
message=assistant_prompt_message,
|
|
|
|
|
usage=usage
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def _handle_generate_response(self, model: str, credentials: dict, prediction: Prediction, stop: list[str],
|
|
|
|
|
prompt_messages: list[PromptMessage]) -> LLMResult:
|
|
|
|
|
|