This commit is contained in:
2026-05-11 20:27:41 +08:00
parent e30b733b17
commit 481a27fc53
60 changed files with 6357 additions and 1100 deletions

View File

@@ -13,7 +13,7 @@ use module_assets::{
use platform_oss::{LegacyAssetPrefix, OssObjectAccess, OssPutObjectRequest};
use reqwest::header;
use serde_json::{Map, Value, json};
use shared_contracts::visual_novel as contract;
use shared_contracts::{creation_audio, visual_novel as contract};
use crate::{
api_response::json_success_body, auth::AuthenticatedAccessToken, http_error::AppError,
@@ -51,6 +51,17 @@ struct DownloadedAudio {
extension: String,
}
/// Describes where a generated audio asset is stored and which entity it is
/// bound to once it has been persisted.
///
/// Built by `build_visual_novel_audio_target` and `build_creation_audio_target`
/// from their respective request payloads.
#[derive(Clone, Debug)]
struct AudioAssetBindingTarget {
/// Kind of the entity the asset is bound to.
entity_kind: String,
/// Identifier of the entity the asset is bound to.
entity_id: String,
/// Name of the slot on the entity that receives the asset.
slot: String,
/// Asset kind label recorded for the asset.
asset_kind: String,
/// Optional profile identifier; `None` when the payload carried no usable value.
profile_id: Option<String>,
/// Object-storage prefix the asset is uploaded under.
storage_prefix: LegacyAssetPrefix,
/// Scope label for the storage location ("visual-novel" for visual-novel
/// targets, the entity kind for creation targets).
storage_scope: String,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum AudioAssetSlot {
BackgroundMusic,
@@ -58,13 +69,6 @@ enum AudioAssetSlot {
}
impl AudioAssetSlot {
fn contract_kind(self) -> contract::VisualNovelAudioGenerationKind {
match self {
Self::BackgroundMusic => contract::VisualNovelAudioGenerationKind::BackgroundMusic,
Self::SoundEffect => contract::VisualNovelAudioGenerationKind::SoundEffect,
}
}
fn provider(self) -> &'static str {
match self {
Self::BackgroundMusic => VECTOR_ENGINE_SUNO_PROVIDER,
@@ -92,6 +96,13 @@ impl AudioAssetSlot {
Self::SoundEffect => "sound-effect",
}
}
fn creation_contract_kind(self) -> creation_audio::CreationAudioGenerationKind {
match self {
Self::BackgroundMusic => creation_audio::CreationAudioGenerationKind::BackgroundMusic,
Self::SoundEffect => creation_audio::CreationAudioGenerationKind::SoundEffect,
}
}
}
pub async fn create_visual_novel_background_music_task(
@@ -148,6 +159,25 @@ pub async fn create_visual_novel_background_music_task(
))
}
/// Axum handler that submits a background-music generation task.
///
/// Parses the JSON body, forwards its `prompt`, `title`, `tags` and `model`
/// fields to `create_background_music_task_response`, and wraps the outcome in
/// the standard success body. Failures are rendered with the request context.
pub async fn create_background_music_task(
    State(state): State<AppState>,
    axum::extract::Extension(request_context): axum::extract::Extension<RequestContext>,
    payload: Result<Json<creation_audio::CreateBackgroundMusicRequest>, JsonRejection>,
) -> Result<Json<Value>, Response> {
    let request = parse_json_payload(&request_context, payload)?.0;
    let outcome = create_background_music_task_response(
        &state,
        &request_context,
        request.prompt,
        request.title,
        request.tags,
        request.model,
    )
    .await;

    match outcome {
        Ok(task) => Ok(json_success_body(Some(&request_context), task)),
        Err(error) => Err(error.into_response_with_context(Some(&request_context))),
    }
}
pub async fn create_visual_novel_sound_effect_task(
State(state): State<AppState>,
axum::extract::Extension(request_context): axum::extract::Extension<RequestContext>,
@@ -198,6 +228,116 @@ pub async fn create_visual_novel_sound_effect_task(
))
}
/// Axum handler that submits a sound-effect generation task.
///
/// Parses the JSON body, forwards its `prompt`, `duration` and `seed` fields to
/// `create_sound_effect_task_response`, and wraps the outcome in the standard
/// success body. Failures are rendered with the request context.
pub async fn create_sound_effect_task(
    State(state): State<AppState>,
    axum::extract::Extension(request_context): axum::extract::Extension<RequestContext>,
    payload: Result<Json<creation_audio::CreateSoundEffectRequest>, JsonRejection>,
) -> Result<Json<Value>, Response> {
    let request = parse_json_payload(&request_context, payload)?.0;
    let outcome =
        create_sound_effect_task_response(&state, request.prompt, request.duration, request.seed)
            .await;

    match outcome {
        Ok(task) => Ok(json_success_body(Some(&request_context), task)),
        Err(error) => Err(error.into_response_with_context(Some(&request_context))),
    }
}
/// Submits a Suno background-music task to the vector-engine upstream.
///
/// Steps, in order: resolve the audio settings, build the HTTP client, validate
/// `prompt`, `title` and the optional `tags` against their length limits, pick
/// the model (falling back to `SUNO_DEFAULT_MODEL` when none is supplied), then
/// POST the request to `/suno/submit/music`.
///
/// The task id is read from the response's `data` path first, then from the
/// first `task_id` key, then from the first `taskId` key.
///
/// # Errors
/// Returns an `AppError` when settings or client construction fail, when a
/// text field is rejected, when the upstream call fails, or when the upstream
/// response carries no task id.
async fn create_background_music_task_response(
    state: &AppState,
    _request_context: &RequestContext,
    prompt: String,
    title: String,
    tags: Option<String>,
    model: Option<String>,
) -> Result<creation_audio::AudioGenerationTaskResponse, AppError> {
    let settings = require_vector_engine_audio_settings(state)?;
    let http_client = build_vector_engine_audio_http_client(&settings)?;

    // Validate the user-supplied text fields in the same order as the payload.
    let checked_prompt = normalize_limited_text(&prompt, "prompt", SUNO_PROMPT_MAX_CHARS)?;
    let checked_title = normalize_limited_text(&title, "title", SUNO_TITLE_MAX_CHARS)?;
    let checked_tags = match tags.as_deref() {
        Some(raw_tags) => Some(normalize_limited_text(
            raw_tags,
            "tags",
            SUNO_TAGS_MAX_CHARS,
        )?),
        None => None,
    };
    let suno_model = match normalize_optional_text(model.as_deref()) {
        Some(requested) => requested,
        None => SUNO_DEFAULT_MODEL.to_string(),
    };

    // Assemble the upstream request body; `tags` is only sent when present.
    let mut request_body = Map::new();
    request_body.insert("prompt".to_string(), Value::String(checked_prompt));
    request_body.insert("mv".to_string(), Value::String(suno_model));
    request_body.insert("title".to_string(), Value::String(checked_title));
    request_body.insert("task".to_string(), Value::String("generate".to_string()));
    if let Some(tag_text) = checked_tags {
        request_body.insert("tags".to_string(), Value::String(tag_text));
    }

    let upstream = post_vector_engine_json(
        &http_client,
        &settings,
        "/suno/submit/music",
        Value::Object(request_body),
        "提交 Suno 背景音乐任务失败",
    )
    .await?;

    // Try the known locations of the task id, lazily, in priority order.
    let located_task_id = extract_string_by_path(&upstream, &["data"])
        .or_else(|| find_first_string_by_key(&upstream, "task_id"))
        .or_else(|| find_first_string_by_key(&upstream, "taskId"));
    let task_id = located_task_id.ok_or_else(|| {
        vector_engine_bad_gateway("提交 Suno 背景音乐任务失败:上游未返回任务 ID")
    })?;

    Ok(creation_audio::AudioGenerationTaskResponse {
        kind: creation_audio::CreationAudioGenerationKind::BackgroundMusic,
        task_id,
        provider: VECTOR_ENGINE_SUNO_PROVIDER.to_string(),
        status: "submitted".to_string(),
    })
}
/// Submits a Vidu sound-effect task to the vector-engine upstream.
///
/// Steps, in order: resolve the audio settings, build the HTTP client, validate
/// `prompt` against `VIDU_PROMPT_MAX_CHARS`, resolve the duration (defaulting to
/// `DEFAULT_SOUND_EFFECT_DURATION_SECONDS` and clamped to the inclusive range
/// 2..=10), then POST the request to `/ent/v2/text2audio`. `seed` is only sent
/// when provided.
///
/// The task id is read from the first `task_id` key, then the first `taskId`
/// key. The reported status is the first `state` string in the response, or
/// `"created"` when none is present.
///
/// # Errors
/// Returns an `AppError` when settings or client construction fail, when the
/// prompt is rejected, when the upstream call fails, or when the upstream
/// response carries no task id.
async fn create_sound_effect_task_response(
    state: &AppState,
    prompt: String,
    duration: Option<u8>,
    seed: Option<u64>,
) -> Result<creation_audio::AudioGenerationTaskResponse, AppError> {
    let settings = require_vector_engine_audio_settings(state)?;
    let http_client = build_vector_engine_audio_http_client(&settings)?;

    let checked_prompt = normalize_limited_text(&prompt, "prompt", VIDU_PROMPT_MAX_CHARS)?;
    let requested_duration = match duration {
        Some(value) => value,
        None => DEFAULT_SOUND_EFFECT_DURATION_SECONDS,
    };
    let bounded_duration = requested_duration.clamp(2, 10);

    let mut request_body = Map::new();
    request_body.insert(
        "model".to_string(),
        Value::String(VIDU_AUDIO_MODEL.to_string()),
    );
    request_body.insert("prompt".to_string(), Value::String(checked_prompt));
    request_body.insert("duration".to_string(), json!(bounded_duration));
    if let Some(seed_value) = seed {
        request_body.insert("seed".to_string(), json!(seed_value));
    }

    let upstream = post_vector_engine_json(
        &http_client,
        &settings,
        "/ent/v2/text2audio",
        Value::Object(request_body),
        "提交 Vidu 音效任务失败",
    )
    .await?;

    let located_task_id = find_first_string_by_key(&upstream, "task_id")
        .or_else(|| find_first_string_by_key(&upstream, "taskId"));
    let task_id = located_task_id
        .ok_or_else(|| vector_engine_bad_gateway("提交 Vidu 音效任务失败:上游未返回任务 ID"))?;
    let status = match find_first_string_by_key(&upstream, "state") {
        Some(reported) => reported,
        None => "created".into(),
    };

    Ok(creation_audio::AudioGenerationTaskResponse {
        kind: creation_audio::CreationAudioGenerationKind::SoundEffect,
        task_id,
        provider: VECTOR_ENGINE_VIDU_PROVIDER.to_string(),
        status,
    })
}
pub async fn publish_visual_novel_background_music_asset(
State(state): State<AppState>,
Path(task_id): Path<String>,
@@ -205,16 +345,30 @@ pub async fn publish_visual_novel_background_music_asset(
axum::extract::Extension(authenticated): axum::extract::Extension<AuthenticatedAccessToken>,
payload: Result<Json<contract::PublishVisualNovelGeneratedAudioAssetRequest>, JsonRejection>,
) -> Result<Json<Value>, Response> {
let payload = parse_json_payload(&request_context, payload)?.0;
let target = build_visual_novel_audio_target(payload, AudioAssetSlot::BackgroundMusic)?;
publish_generated_audio_asset(
&state,
&request_context,
authenticated.claims().user_id(),
task_id,
parse_json_payload(&request_context, payload)?.0,
AudioAssetSlot::BackgroundMusic,
target,
)
.await
.map(|payload| json_success_body(Some(&request_context), payload))
.map(|payload| {
json_success_body(
Some(&request_context),
contract::VisualNovelGeneratedAudioAssetResponse {
kind: contract::VisualNovelAudioGenerationKind::BackgroundMusic,
task_id: payload.task_id,
provider: payload.provider,
status: payload.status,
asset_object_id: payload.asset_object_id,
asset_kind: payload.asset_kind,
audio_src: payload.audio_src,
},
)
})
.map_err(|error| error.into_response_with_context(Some(&request_context)))
}
@@ -225,13 +379,69 @@ pub async fn publish_visual_novel_sound_effect_asset(
axum::extract::Extension(authenticated): axum::extract::Extension<AuthenticatedAccessToken>,
payload: Result<Json<contract::PublishVisualNovelGeneratedAudioAssetRequest>, JsonRejection>,
) -> Result<Json<Value>, Response> {
let payload = parse_json_payload(&request_context, payload)?.0;
let target = build_visual_novel_audio_target(payload, AudioAssetSlot::SoundEffect)?;
publish_generated_audio_asset(
&state,
&request_context,
authenticated.claims().user_id(),
task_id,
parse_json_payload(&request_context, payload)?.0,
AudioAssetSlot::SoundEffect,
target,
)
.await
.map(|payload| {
json_success_body(
Some(&request_context),
contract::VisualNovelGeneratedAudioAssetResponse {
kind: contract::VisualNovelAudioGenerationKind::SoundEffect,
task_id: payload.task_id,
provider: payload.provider,
status: payload.status,
asset_object_id: payload.asset_object_id,
asset_kind: payload.asset_kind,
audio_src: payload.audio_src,
},
)
})
.map_err(|error| error.into_response_with_context(Some(&request_context)))
}
/// Axum handler that publishes a generated background-music track as an asset.
///
/// Parses the JSON body, turns it into an `AudioAssetBindingTarget` via
/// `build_creation_audio_target`, and delegates to
/// `publish_generated_audio_asset` for the authenticated user with the
/// `BackgroundMusic` slot. The result is wrapped in the standard success body.
///
/// Every `AppError` raised here — including target validation failures — is
/// rendered through `into_response_with_context`, so all error responses of
/// this handler carry the request context.
pub async fn publish_background_music_asset(
    State(state): State<AppState>,
    Path(task_id): Path<String>,
    axum::extract::Extension(request_context): axum::extract::Extension<RequestContext>,
    axum::extract::Extension(authenticated): axum::extract::Extension<AuthenticatedAccessToken>,
    payload: Result<Json<creation_audio::PublishGeneratedAudioAssetRequest>, JsonRejection>,
) -> Result<Json<Value>, Response> {
    let payload = parse_json_payload(&request_context, payload)?.0;
    // Render validation failures with the request context, like the publish
    // errors below, instead of relying on the bare `?` conversion.
    let target = build_creation_audio_target(payload)
        .map_err(|error| error.into_response_with_context(Some(&request_context)))?;
    publish_generated_audio_asset(
        &state,
        authenticated.claims().user_id(),
        task_id,
        AudioAssetSlot::BackgroundMusic,
        target,
    )
    .await
    .map(|payload| json_success_body(Some(&request_context), payload))
    .map_err(|error| error.into_response_with_context(Some(&request_context)))
}
pub async fn publish_sound_effect_asset(
State(state): State<AppState>,
Path(task_id): Path<String>,
axum::extract::Extension(request_context): axum::extract::Extension<RequestContext>,
axum::extract::Extension(authenticated): axum::extract::Extension<AuthenticatedAccessToken>,
payload: Result<Json<creation_audio::PublishGeneratedAudioAssetRequest>, JsonRejection>,
) -> Result<Json<Value>, Response> {
let payload = parse_json_payload(&request_context, payload)?.0;
let target = build_creation_audio_target(payload)?;
publish_generated_audio_asset(
&state,
authenticated.claims().user_id(),
task_id,
AudioAssetSlot::SoundEffect,
target,
)
.await
.map(|payload| json_success_body(Some(&request_context), payload))
@@ -240,15 +450,12 @@ pub async fn publish_visual_novel_sound_effect_asset(
async fn publish_generated_audio_asset(
state: &AppState,
_request_context: &RequestContext,
owner_user_id: &str,
task_id: String,
payload: contract::PublishVisualNovelGeneratedAudioAssetRequest,
slot: AudioAssetSlot,
) -> Result<contract::VisualNovelGeneratedAudioAssetResponse, AppError> {
target: AudioAssetBindingTarget,
) -> Result<creation_audio::GeneratedAudioAssetResponse, AppError> {
let task_id = normalize_limited_text(&task_id, "taskId", 160)?;
let scene_id = normalize_limited_text(&payload.scene_id, "sceneId", 160)?;
let profile_id = normalize_optional_text(payload.profile_id.as_deref());
let settings = require_vector_engine_audio_settings(state)?;
let http_client = build_vector_engine_audio_http_client(&settings)?;
let task_payload = fetch_audio_task_payload(&http_client, &settings, slot, &task_id).await?;
@@ -277,8 +484,8 @@ async fn publish_generated_audio_asset(
}
if is_pending_task_status(&status) && audio_urls.is_empty() {
return Ok(contract::VisualNovelGeneratedAudioAssetResponse {
kind: slot.contract_kind(),
return Ok(creation_audio::GeneratedAudioAssetResponse {
kind: slot.creation_contract_kind(),
task_id,
provider: slot.provider().to_string(),
status,
@@ -303,21 +510,20 @@ async fn publish_generated_audio_asset(
state,
&http_client,
owner_user_id,
profile_id,
scene_id,
&task_id,
slot,
target.clone(),
audio,
)
.await?;
Ok(contract::VisualNovelGeneratedAudioAssetResponse {
kind: slot.contract_kind(),
Ok(creation_audio::GeneratedAudioAssetResponse {
kind: slot.creation_contract_kind(),
task_id,
provider: slot.provider().to_string(),
status: "completed".to_string(),
asset_object_id: Some(persisted.asset_object_id),
asset_kind: Some(slot.asset_kind().to_string()),
asset_kind: Some(target.asset_kind),
audio_src: Some(persisted.audio_src),
})
}
@@ -360,10 +566,9 @@ async fn persist_generated_audio_asset(
state: &AppState,
http_client: &reqwest::Client,
owner_user_id: &str,
profile_id: Option<String>,
scene_id: String,
task_id: &str,
slot: AudioAssetSlot,
target: AudioAssetBindingTarget,
audio: DownloadedAudio,
) -> Result<PersistedAudioAsset, AppError> {
let oss_client = state.oss_client().ok_or_else(|| {
@@ -378,20 +583,26 @@ async fn persist_generated_audio_asset(
.put_object(
http_client,
OssPutObjectRequest {
prefix: LegacyAssetPrefix::CustomWorldScenes,
prefix: target.storage_prefix,
path_segments: vec![
"visual-novel".to_string(),
profile_id.clone().unwrap_or_else(|| "draft".to_string()),
scene_id.clone(),
slot.slot().to_string(),
],
target.storage_scope.clone(),
target
.profile_id
.clone()
.unwrap_or_else(|| "draft".to_string()),
target.entity_id.clone(),
target.slot.clone(),
]
.into_iter()
.map(|segment| sanitize_audio_path_segment(segment.as_str(), "audio"))
.collect(),
file_name,
content_type: Some(audio.mime_type.clone()),
access: OssObjectAccess::Private,
metadata: build_audio_asset_metadata(
owner_user_id,
profile_id.as_deref(),
&scene_id,
target.profile_id.as_deref(),
&target,
slot,
),
body: audio.bytes,
@@ -420,11 +631,11 @@ async fn persist_generated_audio_asset(
head.content_type.or(Some(audio.mime_type)),
head.content_length,
head.etag,
slot.asset_kind().to_string(),
target.asset_kind.clone(),
Some(task_id.to_string()),
Some(owner_user_id.to_string()),
profile_id.clone(),
Some(scene_id.clone()),
target.profile_id.clone(),
Some(target.entity_id.clone()),
now_micros,
)
.map_err(map_asset_field_error)?,
@@ -437,12 +648,12 @@ async fn persist_generated_audio_asset(
build_asset_entity_binding_input(
generate_asset_binding_id(now_micros),
asset_object.asset_object_id.clone(),
AUDIO_ENTITY_KIND.to_string(),
scene_id,
slot.slot().to_string(),
slot.asset_kind().to_string(),
target.entity_kind,
target.entity_id,
target.slot,
target.asset_kind,
Some(owner_user_id.to_string()),
profile_id,
target.profile_id,
now_micros,
)
.map_err(map_asset_field_error)?,
@@ -459,15 +670,15 @@ async fn persist_generated_audio_asset(
fn build_audio_asset_metadata(
owner_user_id: &str,
profile_id: Option<&str>,
scene_id: &str,
target: &AudioAssetBindingTarget,
slot: AudioAssetSlot,
) -> BTreeMap<String, String> {
let mut metadata = BTreeMap::from([
("asset-kind".to_string(), slot.asset_kind().to_string()),
("asset-kind".to_string(), target.asset_kind.clone()),
("owner-user-id".to_string(), owner_user_id.to_string()),
("entity-kind".to_string(), AUDIO_ENTITY_KIND.to_string()),
("entity-id".to_string(), scene_id.to_string()),
("slot".to_string(), slot.slot().to_string()),
("entity-kind".to_string(), target.entity_kind.clone()),
("entity-id".to_string(), target.entity_id.clone()),
("slot".to_string(), target.slot.clone()),
("provider".to_string(), slot.provider().to_string()),
]);
if let Some(profile_id) = profile_id {
@@ -476,6 +687,51 @@ fn build_audio_asset_metadata(
metadata
}
/// Builds the binding target for a visual-novel audio asset.
///
/// The scene id from the payload becomes the entity id (validated, at most 160
/// characters per the limit passed to `normalize_limited_text`). Slot and asset
/// kind come from `slot`; storage uses the `CustomWorldScenes` prefix with the
/// `"visual-novel"` scope.
///
/// # Errors
/// Returns an `AppError` when the scene id is rejected.
fn build_visual_novel_audio_target(
    payload: contract::PublishVisualNovelGeneratedAudioAssetRequest,
    slot: AudioAssetSlot,
) -> Result<AudioAssetBindingTarget, AppError> {
    let scene_id = normalize_limited_text(&payload.scene_id, "sceneId", 160)?;
    let profile_id = normalize_optional_text(payload.profile_id.as_deref());

    let target = AudioAssetBindingTarget {
        entity_kind: AUDIO_ENTITY_KIND.to_string(),
        entity_id: scene_id,
        slot: slot.slot().to_string(),
        asset_kind: slot.asset_kind().to_string(),
        profile_id,
        storage_prefix: LegacyAssetPrefix::CustomWorldScenes,
        storage_scope: "visual-novel".to_string(),
    };
    Ok(target)
}
fn build_creation_audio_target(
payload: creation_audio::PublishGeneratedAudioAssetRequest,
) -> Result<AudioAssetBindingTarget, AppError> {
let entity_kind = normalize_limited_text(&payload.entity_kind, "entityKind", 80)?;
let entity_id = normalize_limited_text(&payload.entity_id, "entityId", 160)?;
let slot = normalize_limited_text(&payload.slot, "slot", 80)?;
let asset_kind = normalize_limited_text(&payload.asset_kind, "assetKind", 80)?;
let storage_prefix = match payload.storage_prefix {
Some(creation_audio::CreationAudioStoragePrefix::PuzzleAssets) => {
LegacyAssetPrefix::PuzzleAssets
}
Some(creation_audio::CreationAudioStoragePrefix::Match3DAssets) => {
LegacyAssetPrefix::Match3DAssets
}
Some(creation_audio::CreationAudioStoragePrefix::CustomWorldScenes) | None => {
LegacyAssetPrefix::CustomWorldScenes
}
};
Ok(AudioAssetBindingTarget {
storage_scope: entity_kind.clone(),
entity_kind,
entity_id,
slot,
asset_kind,
profile_id: normalize_optional_text(payload.profile_id.as_deref()),
storage_prefix,
})
}
fn require_vector_engine_audio_settings(
state: &AppState,
) -> Result<VectorEngineAudioSettings, AppError> {
@@ -878,6 +1134,30 @@ fn encode_path_segment(value: &str) -> String {
urlencoding::encode(value).into_owned()
}
/// Converts arbitrary text into a storage-safe path segment.
///
/// The input is trimmed; ASCII letters and digits are lower-cased and kept,
/// `-` and `_` are kept, and every other character becomes `-`. Runs of `-` are
/// collapsed and leading/trailing `-` are dropped. The result is limited to 80
/// characters, and a `-` left dangling by that cut is removed so the segment
/// never ends with a separator.
///
/// Returns `fallback` unchanged when nothing usable remains.
fn sanitize_audio_path_segment(raw: &str, fallback: &str) -> String {
    const MAX_SEGMENT_CHARS: usize = 80;

    let normalized: String = raw
        .trim()
        .chars()
        .map(|ch| {
            if ch.is_ascii_alphanumeric() || ch == '-' || ch == '_' {
                ch.to_ascii_lowercase()
            } else {
                '-'
            }
        })
        .collect();

    // Splitting on '-' and dropping empty parts both collapses hyphen runs and
    // strips hyphens from either end.
    let collapsed = normalized
        .split('-')
        .filter(|part| !part.is_empty())
        .collect::<Vec<_>>()
        .join("-");
    if collapsed.is_empty() {
        return fallback.to_string();
    }

    // The cut may land right after a separator; `collapsed` starts with a
    // non-hyphen character, so trimming can never produce an empty segment.
    let truncated: String = collapsed.chars().take(MAX_SEGMENT_CHARS).collect();
    truncated.trim_end_matches('-').to_string()
}
/// Returns at most the first 800 characters (Unicode scalar values) of `raw_text`.
fn truncate_raw(raw_text: &str) -> String {
    // The byte offset of the 801st character, if any, is where the cut goes.
    match raw_text.char_indices().nth(800) {
        Some((cut_at, _)) => raw_text[..cut_at].to_owned(),
        None => raw_text.to_owned(),
    }
}