Fix document content special characters causing SQL syntax errors

- Add specialized document content cleaning function
- Handle backticks, quotes, newlines, and control characters properly
- Replace problematic characters instead of just escaping them
- Normalize whitespace and remove control characters
- Fix "Syntax error at or near" issues from document content like shell commands

This resolves SQL syntax errors when documents contain shell scripts,
code snippets, or other text with special formatting characters.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
pull/22551/head
yunqiqiliang 10 months ago
parent f116407045
commit 8dea8766e9

@ -321,16 +321,13 @@ class ClickzettaVector(BaseVector):
# For JSON column in Clickzetta, use safe JSON formatting
metadata_json = self._escape_json_string(doc.metadata)
embedding_str = self._format_vector(embedding)
escaped_content = self._escape_string(doc.page_content)
values.append(f"('{doc_id}', '{escaped_content}', "
cleaned_content = self._clean_document_content(doc.page_content)
values.append(f"('{doc_id}', '{cleaned_content}', "
f"JSON '{metadata_json}', {embedding_str})")
# Use regular INSERT - primary key will handle duplicates
insert_sql = f"""
INSERT INTO {self._config.schema}.{self._table_name}
(id, {Field.CONTENT_KEY.value}, {Field.METADATA_KEY.value}, {Field.VECTOR.value})
VALUES {','.join(values)}
"""
columns = f"id, {Field.CONTENT_KEY.value}, {Field.METADATA_KEY.value}, {Field.VECTOR.value}"
insert_sql = f"INSERT INTO {self._config.schema}.{self._table_name} ({columns}) VALUES {','.join(values)}"
with self._connection.cursor() as cursor:
cursor.execute(insert_sql)
@ -547,12 +544,16 @@ class ClickzettaVector(BaseVector):
return ""
# Replace single quotes and other potentially problematic characters
s = str(s)
s = s.replace("\\", "\\\\") # Escape backslashes first
s = s.replace("'", "''") # Escape single quotes
s = s.replace("\\", "\\\\") # Escape backslashes
s = s.replace("\n", "\\n") # Escape newlines
s = s.replace("\r", "\\r") # Escape carriage returns
s = s.replace("\t", "\\t") # Escape tabs
return s
s = s.replace("`", "\\`") # Escape backticks
s = s.replace('"', '\\"') # Escape double quotes
s = s.replace("\n", " ") # Replace newlines with spaces
s = s.replace("\r", " ") # Replace carriage returns with spaces
s = s.replace("\t", " ") # Replace tabs with spaces
# Remove any remaining control characters
s = ''.join(char for char in s if ord(char) >= 32 or char in [' '])
return s.strip()
def _format_vector(self, vector: list[float]) -> str:
"""Safely format vector for SQL, handling special float values."""
@ -595,6 +596,32 @@ class ClickzettaVector(BaseVector):
return str(uuid.uuid4())
return safe_id[:255] # Limit length
def _clean_document_content(self, content: str) -> str:
"""Clean document content for safe SQL insertion."""
if not content:
return ""
content = str(content)
# Remove or replace problematic characters that can break SQL
content = content.replace("'", "''") # SQL quote escaping
content = content.replace("\\", "\\\\") # Escape backslashes
content = content.replace("`", "'") # Replace backticks with single quotes
content = content.replace('"', "''") # Replace double quotes with escaped single quotes
# Replace line breaks and tabs with spaces to avoid multiline issues
content = content.replace("\n", " ")
content = content.replace("\r", " ")
content = content.replace("\t", " ")
# Remove control characters but keep printable ones
cleaned = ''.join(char if ord(char) >= 32 else ' ' for char in content)
# Normalize multiple spaces to single space
import re
cleaned = re.sub(r'\s+', ' ', cleaned)
return cleaned.strip()
class ClickzettaVectorFactory(AbstractVectorFactory):
"""Factory for creating Clickzetta vector instances."""

Loading…
Cancel
Save