This commit is contained in:
2026-05-08 11:44:42 +08:00
parent b08127031c
commit abf1f1ebea
249 changed files with 39411 additions and 887 deletions

View File

@@ -1,3 +1,5 @@
use std::io::{Cursor, Read};
use axum::{Json, extract::Extension, http::StatusCode};
use base64::{Engine as _, engine::general_purpose::STANDARD as BASE64_STANDARD};
use serde_json::{Value, json};
@@ -12,7 +14,7 @@ use crate::{
const MAX_DOCUMENT_INPUT_BYTES: usize = 256 * 1024;
const MAX_DOCUMENT_INPUT_BASE64_CHARS: usize = 360 * 1024;
const SUPPORTED_DOCUMENT_EXTENSIONS: &[&str] = &["txt", "md", "markdown", "csv", "json"];
const SUPPORTED_DOCUMENT_EXTENSIONS: &[&str] = &["txt", "md", "markdown", "docx", "csv", "json"];
pub async fn parse_creation_agent_document_input(
Extension(request_context): Extension<RequestContext>,
@@ -58,12 +60,8 @@ pub async fn parse_creation_agent_document_input(
);
}
let text = String::from_utf8(decoded.clone()).map_err(|_| {
AppError::from_status(StatusCode::BAD_REQUEST).with_details(json!({
"message": "暂时只支持 UTF-8 文本文档,请转换编码后再上传。",
"field": "contentBase64",
}))
})?;
let extension = document_extension(&file_name)?;
let text = decode_document_text(&decoded, extension.as_str())?;
let normalized_text = normalize_document_text(&text);
if normalized_text.trim().is_empty() {
@@ -88,6 +86,7 @@ pub async fn parse_creation_agent_document_input(
.map(str::to_string),
size_bytes: decoded.len(),
text: normalized_text,
source_asset_id: None,
},
},
))
@@ -115,11 +114,7 @@ fn normalize_file_name(value: &str) -> Result<String, AppError> {
}
fn ensure_supported_extension(file_name: &str) -> Result<(), AppError> {
let extension = file_name
.rsplit_once('.')
.map(|(_, extension)| extension.trim().to_ascii_lowercase())
.filter(|extension| !extension.is_empty())
.ok_or_else(|| unsupported_document_error(file_name))?;
let extension = document_extension(file_name)?;
if !SUPPORTED_DOCUMENT_EXTENSIONS.contains(&extension.as_str()) {
return Err(unsupported_document_error(file_name));
@@ -128,15 +123,100 @@ fn ensure_supported_extension(file_name: &str) -> Result<(), AppError> {
Ok(())
}
/// Extracts the lowercased extension (text after the last `.`) from a
/// file name.
///
/// Fails with `unsupported_document_error` when the name has no dot or
/// the extension is empty after trimming whitespace.
fn document_extension(file_name: &str) -> Result<String, AppError> {
    match file_name.rsplit_once('.') {
        Some((_, raw)) => {
            let extension = raw.trim().to_ascii_lowercase();
            if extension.is_empty() {
                Err(unsupported_document_error(file_name))
            } else {
                Ok(extension)
            }
        }
        None => Err(unsupported_document_error(file_name)),
    }
}
/// Builds the BAD_REQUEST error returned for file names whose extension
/// is missing or not listed in `SUPPORTED_DOCUMENT_EXTENSIONS`.
fn unsupported_document_error(file_name: &str) -> AppError {
    AppError::from_status(StatusCode::BAD_REQUEST).with_details(json!({
        // User-facing message enumerates the currently supported formats
        // (post-docx-support wording).
        "message": "暂时只支持 txt、md、docx、csv、json 文档。",
        "field": "fileName",
        "fileName": file_name,
        "supportedExtensions": SUPPORTED_DOCUMENT_EXTENSIONS,
    }))
}
/// Decodes raw document bytes into text.
///
/// `docx` payloads are unpacked via `extract_docx_text`; every other
/// supported extension is treated as plain UTF-8 text. Non-UTF-8 input
/// yields a BAD_REQUEST error on the `contentBase64` field.
fn decode_document_text(bytes: &[u8], extension: &str) -> Result<String, AppError> {
    if extension == "docx" {
        return extract_docx_text(bytes);
    }
    // Validate first, then allocate: `std::str::from_utf8` borrows, so we
    // only copy the buffer once validation succeeds (the original
    // `String::from_utf8(bytes.to_vec())` copied even on failure).
    std::str::from_utf8(bytes).map(str::to_owned).map_err(|_| {
        AppError::from_status(StatusCode::BAD_REQUEST).with_details(json!({
            "message": "暂时只支持 UTF-8 文本文档,请转换编码后再上传。",
            "field": "contentBase64",
        }))
    })
}
/// Opens a docx payload as a zip archive, reads the main body part
/// (`word/document.xml`), and returns its visible text.
///
/// Each failure mode maps to a BAD_REQUEST error on `contentBase64` with
/// a message describing whether the archive, the body entry, or the read
/// itself failed.
fn extract_docx_text(bytes: &[u8]) -> Result<String, AppError> {
    // All three failure paths share the same error shape; build it once.
    let invalid = |message: &str| {
        AppError::from_status(StatusCode::BAD_REQUEST).with_details(json!({
            "message": message,
            "field": "contentBase64",
        }))
    };

    let mut archive = zip::ZipArchive::new(Cursor::new(bytes))
        .map_err(|_| invalid("docx 文档结构无效,请重新选择文件。"))?;
    let mut entry = archive
        .by_name("word/document.xml")
        .map_err(|_| invalid("docx 文档缺少正文内容。"))?;

    let mut document_xml = String::new();
    entry
        .read_to_string(&mut document_xml)
        .map_err(|_| invalid("docx 文档正文读取失败。"))?;
    Ok(extract_docx_visible_text(&document_xml))
}
/// Pulls the human-visible text out of a WordprocessingML body.
///
/// Scans for `<w:t …>…</w:t>` runs, concatenates their entity-decoded
/// contents, and appends a newline when a `<w:br` element immediately
/// follows a text run. Paragraph/table boundaries are not otherwise
/// reflected in the output (matches the existing normalization step
/// downstream).
fn extract_docx_visible_text(xml: &str) -> String {
    let mut output = String::new();
    let mut cursor = 0usize;
    while let Some(start_offset) = xml[cursor..].find("<w:t") {
        let start = cursor + start_offset;
        let name_end = start + "<w:t".len();
        // "<w:t" is a prefix of other element names (<w:tab/>, <w:tbl>,
        // <w:tc>, <w:tr>, …). Only '>', '/', or whitespace directly after
        // the name makes this a real <w:t> element; skipping the rest
        // prevents raw markup from leaking into the extracted text.
        let is_text_tag = matches!(
            xml[name_end..].chars().next(),
            Some(c) if c == '>' || c == '/' || c.is_whitespace()
        );
        if !is_text_tag {
            cursor = name_end;
            continue;
        }
        let Some(tag_end_offset) = xml[start..].find('>') else {
            break; // malformed tail: stop scanning rather than mis-slice
        };
        let tag_end = start + tag_end_offset;
        // Self-closing <w:t/> carries no text; without this check it would
        // pair with a later </w:t> and swallow the markup in between.
        if xml[..tag_end].ends_with('/') {
            cursor = tag_end + 1;
            continue;
        }
        let text_start = tag_end + 1;
        let Some(end_offset) = xml[text_start..].find("</w:t>") else {
            break;
        };
        let text_end = text_start + end_offset;
        output.push_str(&decode_xml_text(&xml[text_start..text_end]));
        cursor = text_end + "</w:t>".len();
        // Explicit line break right after the run. `starts_with` replaces
        // the original `find(..) == 0`, which scanned the whole remainder.
        if xml[cursor..].starts_with("<w:br") {
            output.push('\n');
        }
    }
    output
}

/// Decodes the five predefined XML entities.
///
/// `&amp;` must be replaced last so `&amp;lt;` decodes to the literal
/// `&lt;` rather than `<`. Numeric character references (`&#…;`) are not
/// handled — NOTE(review): confirm docx bodies seen here never need them.
fn decode_xml_text(value: &str) -> String {
    value
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&apos;", "'")
        .replace("&amp;", "&")
}
fn normalize_document_text(value: &str) -> String {
value
.trim_start_matches('\u{feff}')