From 8dea8766e9a96d35ce03feb9bbc92a9ff4bda0ed Mon Sep 17 00:00:00 2001
From: yunqiqiliang <132561395+yunqiqiliang@users.noreply.github.com>
Date: Thu, 17 Jul 2025 17:45:48 +0800
Subject: [PATCH] Fix document content special characters causing SQL syntax
 errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add specialized document content cleaning function
- Handle backticks, quotes, newlines, and control characters properly
- Replace problematic characters instead of just escaping them
- Normalize whitespace and remove control characters
- Fix "Syntax error at or near" issues from document content like shell commands

This resolves SQL syntax errors when documents contain shell scripts,
code snippets, or other text with special formatting characters.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
Reviewer notes (this section is ignored by git-am):
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch. Context-line indentation in the first hunk is inferred;
  use `git apply --ignore-whitespace` if it does not match the target file.
- _clean_document_content was revised in review: backticks and double quotes
  are now normalized BEFORE quote escaping, so no unescaped single quote can
  end up inside the SQL literal. The `index` blob hash below predates that
  revision.

 .../vdb/clickzetta/clickzetta_vector.py       | 51 ++++++++++++++-----
 1 file changed, 39 insertions(+), 12 deletions(-)

diff --git a/api/core/rag/datasource/vdb/clickzetta/clickzetta_vector.py b/api/core/rag/datasource/vdb/clickzetta/clickzetta_vector.py
index b484f0cb6b..57261a4442 100644
--- a/api/core/rag/datasource/vdb/clickzetta/clickzetta_vector.py
+++ b/api/core/rag/datasource/vdb/clickzetta/clickzetta_vector.py
@@ -321,16 +321,13 @@ class ClickzettaVector(BaseVector):
             # For JSON column in Clickzetta, use safe JSON formatting
             metadata_json = self._escape_json_string(doc.metadata)
             embedding_str = self._format_vector(embedding)
-            escaped_content = self._escape_string(doc.page_content)
-            values.append(f"('{doc_id}', '{escaped_content}', "
+            cleaned_content = self._clean_document_content(doc.page_content)
+            values.append(f"('{doc_id}', '{cleaned_content}', "
                           f"JSON '{metadata_json}', {embedding_str})")
 
         # Use regular INSERT - primary key will handle duplicates
-        insert_sql = f"""
-        INSERT INTO {self._config.schema}.{self._table_name}
-        (id, {Field.CONTENT_KEY.value}, {Field.METADATA_KEY.value}, {Field.VECTOR.value})
-        VALUES {','.join(values)}
-        """
+        columns = f"id, {Field.CONTENT_KEY.value}, {Field.METADATA_KEY.value}, {Field.VECTOR.value}"
+        insert_sql = f"INSERT INTO {self._config.schema}.{self._table_name} ({columns}) VALUES {','.join(values)}"
 
         with self._connection.cursor() as cursor:
             cursor.execute(insert_sql)
@@ -547,12 +544,16 @@ class ClickzettaVector(BaseVector):
             return ""
         # Replace single quotes and other potentially problematic characters
         s = str(s)
+        s = s.replace("\\", "\\\\")  # Escape backslashes first
         s = s.replace("'", "''")  # Escape single quotes
-        s = s.replace("\\", "\\\\")  # Escape backslashes
-        s = s.replace("\n", "\\n")  # Escape newlines
-        s = s.replace("\r", "\\r")  # Escape carriage returns
-        s = s.replace("\t", "\\t")  # Escape tabs
-        return s
+        s = s.replace("`", "\\`")  # Escape backticks
+        s = s.replace('"', '\\"')  # Escape double quotes
+        s = s.replace("\n", " ")  # Replace newlines with spaces
+        s = s.replace("\r", " ")  # Replace carriage returns with spaces
+        s = s.replace("\t", " ")  # Replace tabs with spaces
+        # Remove any remaining control characters
+        s = ''.join(char for char in s if ord(char) >= 32 or char in [' '])
+        return s.strip()
 
     def _format_vector(self, vector: list[float]) -> str:
         """Safely format vector for SQL, handling special float values."""
@@ -594,6 +595,32 @@ class ClickzettaVector(BaseVector):
         if not safe_id:  # If all characters were removed
             return str(uuid.uuid4())
         return safe_id[:255]  # Limit length
+
+    def _clean_document_content(self, content: str) -> str:
+        """Clean document content for embedding in a single-quoted SQL literal.
+
+        Backticks and double quotes are normalized to single quotes, every
+        whitespace or control character run collapses to one space, and
+        backslashes and single quotes are escaped last so that no bare quote
+        can terminate the literal early.
+        """
+        if not content:
+            return ""
+
+        content = str(content)
+        # Normalize quote-like characters BEFORE escaping; doing it afterwards
+        # would reintroduce unescaped single quotes into the literal.
+        content = content.replace("`", "'").replace('"', "'")
+
+        # Turn control characters (including \n, \r, \t) into spaces
+        cleaned = "".join(char if ord(char) >= 32 else " " for char in content)
+
+        # Collapse whitespace runs and trim both ends
+        cleaned = " ".join(cleaned.split())
+
+        # Escape backslashes first, then single quotes
+        cleaned = cleaned.replace("\\", "\\\\")
+        return cleaned.replace("'", "''")
 
 
 class ClickzettaVectorFactory(AbstractVectorFactory):