@ -81,33 +81,66 @@ class AnswersSummaryAnalysisApi(Resource):
)
)
def _read_file_with_encoding_detection(self, file_id: str) -> Tuple[Optional[str], Optional[str]]:
    """Load an uploaded file and return its content as text.

    The ``UploadFile`` row matching ``file_id`` is looked up and its bytes
    are fetched from storage. XLSX files (recognised by a ``.xlsx`` file
    name suffix or the OOXML spreadsheet MIME type) are parsed with pandas
    and re-serialised as CSV text; every other file is treated as text and
    decoded by trying the chardet-detected encoding followed by a fixed
    list of fallbacks.

    Args:
        file_id: Primary key of the ``UploadFile`` record to read.

    Returns:
        A ``(content, encoding)`` tuple. For XLSX input ``content`` is the
        CSV rendering of the first worksheet and ``encoding`` is
        ``'utf-8'``. For text input ``encoding`` is the first candidate
        that decoded the bytes without error. ``(None, None)`` is returned
        when the record does not exist or any step fails; failures are
        logged rather than raised.
    """
    import logging

    logger = logging.getLogger(__name__)

    try:
        upload_file = db.session.query(UploadFile).filter(UploadFile.id == file_id).first()
        if not upload_file:
            return None, None

        # Get the file content from storage
        file_content = storage.load_once(upload_file.key)

        # Match on the ".xlsx" suffix (with the dot) so that an
        # extension-less file that merely happens to be named "xlsx" is not
        # mistaken for a workbook.
        file_name = upload_file.name or ''
        mime_type = upload_file.mime_type or ''
        is_excel = (
            file_name.lower().endswith('.xlsx')
            or mime_type == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        )

        if is_excel:
            # Imported lazily so the plain-text path does not need pandas.
            import io

            import pandas as pd

            try:
                # sheet_name=0 is the pandas default: only the first
                # worksheet is converted, any further sheets are ignored.
                df = pd.read_excel(io.BytesIO(file_content), sheet_name=0, engine='openpyxl')
                return df.to_csv(index=False), 'utf-8'
            except Exception as e:
                logger.exception("Error converting Excel file: %s", e)
                return None, None

        # Text file: start from chardet's guess, then fall back to common
        # encodings. chardet may report no encoding at all (None), so empty
        # candidates are dropped; dict.fromkeys de-duplicates while keeping
        # the priority order.
        detection = chardet.detect(file_content)
        candidates = [detection.get('encoding'), 'utf-8', 'gbk', 'gb2312', 'iso-8859-1', 'latin-1']
        for enc in dict.fromkeys(c for c in candidates if c):
            try:
                return file_content.decode(enc), enc
            except (UnicodeDecodeError, LookupError):
                # LookupError: the detected name has no Python codec; keep
                # trying the remaining fallbacks instead of giving up.
                continue

        return None, None
    except Exception as e:
        logger.exception("Error reading file: %s", e)
        return None, None