1
This commit is contained in:
@@ -1,3 +1,5 @@
|
||||
use std::io::{Cursor, Read};
|
||||
|
||||
use axum::{Json, extract::Extension, http::StatusCode};
|
||||
use base64::{Engine as _, engine::general_purpose::STANDARD as BASE64_STANDARD};
|
||||
use serde_json::{Value, json};
|
||||
@@ -12,7 +14,7 @@ use crate::{
|
||||
|
||||
// NOTE(review): presumably the cap on the decoded upload size (256 KiB);
// the check itself is outside this view — confirm against the handler.
const MAX_DOCUMENT_INPUT_BYTES: usize = 256 * 1024;

// NOTE(review): presumably the cap on the base64 text before decoding
// (360 Ki characters); confirm against the handler.
const MAX_DOCUMENT_INPUT_BASE64_CHARS: usize = 360 * 1024;

// Lower-case extensions accepted by `ensure_supported_extension`.
// Only one definition may exist: the earlier list without "docx" was a stale
// duplicate of this constant and has been removed.
const SUPPORTED_DOCUMENT_EXTENSIONS: &[&str] = &["txt", "md", "markdown", "docx", "csv", "json"];
|
||||
|
||||
pub async fn parse_creation_agent_document_input(
|
||||
Extension(request_context): Extension<RequestContext>,
|
||||
@@ -58,12 +60,8 @@ pub async fn parse_creation_agent_document_input(
|
||||
);
|
||||
}
|
||||
|
||||
let text = String::from_utf8(decoded.clone()).map_err(|_| {
|
||||
AppError::from_status(StatusCode::BAD_REQUEST).with_details(json!({
|
||||
"message": "暂时只支持 UTF-8 文本文档,请转换编码后再上传。",
|
||||
"field": "contentBase64",
|
||||
}))
|
||||
})?;
|
||||
let extension = document_extension(&file_name)?;
|
||||
let text = decode_document_text(&decoded, extension.as_str())?;
|
||||
let normalized_text = normalize_document_text(&text);
|
||||
|
||||
if normalized_text.trim().is_empty() {
|
||||
@@ -88,6 +86,7 @@ pub async fn parse_creation_agent_document_input(
|
||||
.map(str::to_string),
|
||||
size_bytes: decoded.len(),
|
||||
text: normalized_text,
|
||||
source_asset_id: None,
|
||||
},
|
||||
},
|
||||
))
|
||||
@@ -115,11 +114,7 @@ fn normalize_file_name(value: &str) -> Result<String, AppError> {
|
||||
}
|
||||
|
||||
fn ensure_supported_extension(file_name: &str) -> Result<(), AppError> {
|
||||
let extension = file_name
|
||||
.rsplit_once('.')
|
||||
.map(|(_, extension)| extension.trim().to_ascii_lowercase())
|
||||
.filter(|extension| !extension.is_empty())
|
||||
.ok_or_else(|| unsupported_document_error(file_name))?;
|
||||
let extension = document_extension(file_name)?;
|
||||
|
||||
if !SUPPORTED_DOCUMENT_EXTENSIONS.contains(&extension.as_str()) {
|
||||
return Err(unsupported_document_error(file_name));
|
||||
@@ -128,15 +123,100 @@ fn ensure_supported_extension(file_name: &str) -> Result<(), AppError> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn document_extension(file_name: &str) -> Result<String, AppError> {
|
||||
file_name
|
||||
.rsplit_once('.')
|
||||
.map(|(_, extension)| extension.trim().to_ascii_lowercase())
|
||||
.filter(|extension| !extension.is_empty())
|
||||
.ok_or_else(|| unsupported_document_error(file_name))
|
||||
}
|
||||
|
||||
fn unsupported_document_error(file_name: &str) -> AppError {
|
||||
AppError::from_status(StatusCode::BAD_REQUEST).with_details(json!({
|
||||
"message": "暂时只支持 txt、md、csv、json 文本文档。",
|
||||
"message": "暂时只支持 txt、md、docx、csv、json 文档。",
|
||||
"field": "fileName",
|
||||
"fileName": file_name,
|
||||
"supportedExtensions": SUPPORTED_DOCUMENT_EXTENSIONS,
|
||||
}))
|
||||
}
|
||||
|
||||
fn decode_document_text(bytes: &[u8], extension: &str) -> Result<String, AppError> {
|
||||
if extension == "docx" {
|
||||
return extract_docx_text(bytes);
|
||||
}
|
||||
|
||||
String::from_utf8(bytes.to_vec()).map_err(|_| {
|
||||
AppError::from_status(StatusCode::BAD_REQUEST).with_details(json!({
|
||||
"message": "暂时只支持 UTF-8 文本文档,请转换编码后再上传。",
|
||||
"field": "contentBase64",
|
||||
}))
|
||||
})
|
||||
}
|
||||
|
||||
fn extract_docx_text(bytes: &[u8]) -> Result<String, AppError> {
|
||||
let reader = Cursor::new(bytes);
|
||||
let mut archive = zip::ZipArchive::new(reader).map_err(|_| {
|
||||
AppError::from_status(StatusCode::BAD_REQUEST).with_details(json!({
|
||||
"message": "docx 文档结构无效,请重新选择文件。",
|
||||
"field": "contentBase64",
|
||||
}))
|
||||
})?;
|
||||
let mut document_xml = String::new();
|
||||
archive
|
||||
.by_name("word/document.xml")
|
||||
.map_err(|_| {
|
||||
AppError::from_status(StatusCode::BAD_REQUEST).with_details(json!({
|
||||
"message": "docx 文档缺少正文内容。",
|
||||
"field": "contentBase64",
|
||||
}))
|
||||
})?
|
||||
.read_to_string(&mut document_xml)
|
||||
.map_err(|_| {
|
||||
AppError::from_status(StatusCode::BAD_REQUEST).with_details(json!({
|
||||
"message": "docx 文档正文读取失败。",
|
||||
"field": "contentBase64",
|
||||
}))
|
||||
})?;
|
||||
|
||||
Ok(extract_docx_visible_text(document_xml.as_str()))
|
||||
}
|
||||
|
||||
/// Collects the text of every `<w:t>` element in a WordprocessingML
/// document, in document order.
///
/// The content of each `<w:t …>…</w:t>` element is entity-decoded with
/// `decode_xml_text` and appended to the result. A `'\n'` is appended when a
/// `<w:br` tag starts immediately after the element. Scanning stops at the
/// first text element that is never closed.
///
/// Only an element named exactly `w:t` counts: other elements whose name
/// merely starts with `w:t` (`w:tab`, `w:tbl`, `w:tr`, `w:tc`, …) are
/// skipped, and a self-closing `<w:t/>` contributes no text.
///
/// NOTE(review): paragraph boundaries (`</w:p>`) do not produce a newline, so
/// consecutive paragraphs are joined — confirm this is intended.
fn extract_docx_visible_text(xml: &str) -> String {
    const OPEN: &str = "<w:t";
    const CLOSE: &str = "</w:t>";
    const BREAK: &str = "<w:br";

    let mut output = String::new();
    let mut rest = xml;

    while let Some(found) = rest.find(OPEN) {
        let after_name = &rest[found + OPEN.len()..];

        // The character after "<w:t" decides whether this really is a text
        // element or a longer tag name such as "<w:tbl".
        let is_text_element = matches!(
            after_name.chars().next(),
            Some(ch) if ch == '>' || ch == '/' || ch.is_ascii_whitespace()
        );
        if !is_text_element {
            rest = after_name;
            continue;
        }

        let Some(tag_end) = after_name.find('>') else {
            break;
        };
        let self_closing = after_name[..tag_end].ends_with('/');
        let after_tag = &after_name[tag_end + 1..];

        if self_closing {
            // `<w:t/>` has no content and no closing tag to look for.
            rest = after_tag;
        } else {
            let Some(close) = after_tag.find(CLOSE) else {
                break;
            };
            output.push_str(&decode_xml_text(&after_tag[..close]));
            rest = &after_tag[close + CLOSE.len()..];
        }

        // Prefix check instead of searching the whole remainder on every
        // iteration.
        if rest.starts_with(BREAK) {
            output.push('\n');
        }
    }

    output
}

/// Replaces XML character and entity references in `value` with the
/// characters they stand for.
///
/// Handles the five predefined entities (`&lt;`, `&gt;`, `&quot;`, `&apos;`,
/// `&amp;`) and numeric references (`&#NNN;`, `&#xHHH;`). Anything that is
/// not a recognised reference is copied through unchanged. The text is
/// decoded in one pass, so `&amp;lt;` yields the literal text `&lt;`.
fn decode_xml_text(value: &str) -> String {
    // Longest reference body is "#x10FFFF" (8 bytes); a short look-ahead
    // keeps the scan linear even on input full of stray ampersands.
    const MAX_REFERENCE_LEN: usize = 12;

    let mut decoded = String::with_capacity(value.len());
    let mut rest = value;

    while let Some(amp) = rest.find('&') {
        decoded.push_str(&rest[..amp]);
        let candidate = &rest[amp + 1..];

        // ';' is ASCII, so its byte position is always a char boundary.
        let resolved = candidate
            .bytes()
            .take(MAX_REFERENCE_LEN)
            .position(|byte| byte == b';')
            .and_then(|semi| resolve_xml_entity(&candidate[..semi]).map(|ch| (ch, semi)));

        match resolved {
            Some((ch, semi)) => {
                decoded.push(ch);
                rest = &candidate[semi + 1..];
            }
            None => {
                // Not a reference: keep the ampersand literally.
                decoded.push('&');
                rest = candidate;
            }
        }
    }

    decoded.push_str(rest);
    decoded
}

/// Maps the body of an XML reference (the text between `&` and `;`) to its
/// character, or `None` when the body is not a valid reference.
fn resolve_xml_entity(name: &str) -> Option<char> {
    match name {
        "lt" => Some('<'),
        "gt" => Some('>'),
        "quot" => Some('"'),
        "apos" => Some('\''),
        "amp" => Some('&'),
        _ => {
            let digits = name.strip_prefix('#')?;
            let code = match digits.strip_prefix('x') {
                Some(hex) if !hex.is_empty() && hex.bytes().all(|b| b.is_ascii_hexdigit()) => {
                    u32::from_str_radix(hex, 16).ok()?
                }
                Some(_) => return None,
                None if !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit()) => {
                    digits.parse::<u32>().ok()?
                }
                None => return None,
            };
            char::from_u32(code)
        }
    }
}
|
||||
|
||||
fn normalize_document_text(value: &str) -> String {
|
||||
value
|
||||
.trim_start_matches('\u{feff}')
|
||||
|
||||
Reference in New Issue
Block a user