# SOONet开源大模型实操：微调SOONet适配中文query，需添加中文CLIP分支

## 1. 项目背景与需求

SOONet作为基于自然语言输入的长视频时序片段定位系统，在英文视频内容检索方面表现出色。但在实际应用中，我们经常需要处理中文查询需求。原生SOONet主要针对英文优化，直接处理中文query时效果可能不够理想。为了解决这个问题，我们需要对SOONet进行微调，添加中文CLIP分支，来更好地理解和处理中文查询。这个过程涉及模型架构调整、训练数据处理和微调策略制定。

## 2. 环境准备与依赖安装

### 2.1 基础环境要求

```bash
# 创建conda环境
conda create -n soonet-zh python=3.10
conda activate soonet-zh

# 安装PyTorch（根据CUDA版本选择）
pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 -f https://download.pytorch.org/whl/torch_stable.html

# 安装其他核心依赖
pip install modelscope==1.0.0
pip install gradio==3.34.0
pip install opencv-python==4.7.0.72
pip install ftfy==6.1.1
pip install regex==2022.10.31
pip install numpy==1.24.3
```

### 2.2 中文CLIP模型准备

```bash
# 下载中文CLIP模型
git clone https://github.com/OFA-Sys/Chinese-CLIP.git
cd Chinese-CLIP

# 安装中文CLIP依赖
pip install -r requirements.txt

# 下载预训练权重（以ViT-B/32为例）
wget https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/clip_cn_vit-b-32.pt
```

## 3. 模型架构修改

### 3.1 添加中文CLIP分支

我们需要修改SOONet的文本编码器部分，添加中文CLIP分支：

```python
import torch
import torch.nn as nn
from modelscope.models.base import TorchModel
from modelscope.utils.constant import Tasks


class SOONetWithChineseCLIP(TorchModel):
    def __init__(self, model_dir, *args, **kwargs):
        super().__init__(model_dir, *args, **kwargs)
        # 加载原始SOONet模型
        self.soonet = self.build_soonet(model_dir)
        # 加载中文CLIP文本编码器
        self.chinese_clip = self.build_chinese_clip()
        # 投影层：将中文CLIP特征映射到SOONet空间
        self.text_projection = nn.Linear(512, 512)  # 根据实际维度调整

    def build_chinese_clip(self):
        """加载中文CLIP模型"""
        from cn_clip.clip import load_from_name
        model, preprocess = load_from_name("ViT-B-32", device="cpu")
        return model

    def encode_chinese_text(self, text):
        """中文文本编码"""
        # 中文CLIP的文本预处理
        text_inputs = self.chinese_clip.preprocess_text(text)
        with torch.no_grad():
            text_features = self.chinese_clip.encode_text(text_inputs)
        # 投影到SOONet特征空间
        projected_features = self.text_projection(text_features)
        return projected_features

    def forward(self, inputs):
        if isinstance(inputs, tuple):
            text, video = inputs
        else:
            text = inputs["text"]
            video = inputs["video"]

        # 判断文本语言（简单启发式规则）
        if self._is_chinese(text):
            text_features = self.encode_chinese_text(text)
        else:
            text_features = self.soonet.encode_text(text)

        video_features = self.soonet.encode_video(video)
        # 后续处理与原始SOONet相同
        return self.soonet.temporal_grounding(text_features, video_features)

    def _is_chinese(self, text):
        """简单判断是否为中文文本"""
        import re
        chinese_chars = re.findall(r"[\u4e00-\u9fff]", text)
        return len(chinese_chars) / max(len(text), 1) > 0.3
```

### 3.2 修改配置文件

创建新的配置文件 `configuration_zh.json`：

```json
{
    "framework": "pytorch",
    "task": "video-temporal-grounding",
    "model": {
        "type": "SOONetWithChineseCLIP",
        "soonet_model_path": "/root/ai-models/iic/multi-modal_soonet_video-temporal-grounding/SOONet_MAD_VIT-B-32_4Scale_10C.pth",
        "chinese_clip_path": "Chinese-CLIP/clip_cn_vit-b-32.pt"
    },
    "pipeline": {
        "type": "video-temporal-grounding"
    }
}
```

## 4. 数据准备与预处理

### 4.1 准备中英文平行语料

为了微调模型，我们需要准备视频-文本对数据，包含中英文描述：

```python
import json
import os
from torch.utils.data import Dataset


class BilingualVideoTextDataset(Dataset):
    def __init__(self, video_dir, annotation_file):
        self.video_dir = video_dir
        with open(annotation_file, "r", encoding="utf-8") as f:
            self.annotations = json.load(f)

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        item = self.annotations[idx]
        video_path = os.path.join(self.video_dir, item["video_id"] + ".mp4")
        en_text = item["english_description"]
        zh_text = item["chinese_description"]
        timestamps = item["timestamps"]  # [start_sec, end_sec]

        # 加载视频帧
        frames = self._load_video_frames(video_path, timestamps)

        return {
            "frames": frames,
            "en_text": en_text,
            "zh_text": zh_text,
            "timestamps": timestamps
        }

    def _load_video_frames(self, video_path, timestamps):
        """加载指定时间段的视频帧"""
        import cv2
        cap = cv2.VideoCapture(video_path)
        start_sec, end_sec = timestamps
        fps = cap.get(cv2.CAP_PROP_FPS)
        start_frame = int(start_sec * fps)
        end_frame = int(end_sec * fps)

        frames = []
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
        for i in range(start_frame, end_frame + 1):
            ret, frame = cap.read()
            if ret:
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = cv2.resize(frame, (224, 224))
                frames.append(frame)
        cap.release()
        return frames
```

### 4.2 创建数据加载器

```python
from torch.utils.data import DataLoader
from torchvision import transforms


def create_data_loader(dataset, batch_size=4, shuffle=True):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    def collate_fn(batch):
        # 自定义批处理函数
        processed_batch = []
        for item in batch:
            # 应用变换到每一帧
            frames = torch.stack([transform(frame) for frame in item["frames"]])
            processed_batch.append({
                "frames": frames,
                "en_text": item["en_text"],
                "zh_text": item["zh_text"],
                "timestamps": item["timestamps"]
            })
        return processed_batch

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn,
        num_workers=4
    )
```

## 5. 微调训练策略

### 5.1 损失函数设计

我们需要设计适合双语言训练的损失函数：

```python
import torch.nn.functional as F


class BilingualLoss(nn.Module):
    def __init__(self, alpha=0.5):
        super().__init__()
        self.alpha = alpha  # 中文损失的权重

    def forward(self, en_outputs, zh_outputs, targets):
        """
        en_outputs: 英文分支的输出
        zh_outputs: 中文分支的输出
        targets: 真实时间戳
        """
        # 英文分支损失
        en_loss = F.smooth_l1_loss(en_outputs, targets)
        # 中文分支损失
        zh_loss = F.smooth_l1_loss(zh_outputs, targets)
        # 一致性损失：确保两个分支输出相似
        consistency_loss = F.mse_loss(en_outputs, zh_outputs)

        total_loss = (1 - self.alpha) * en_loss + self.alpha * zh_loss \
                     + 0.1 * consistency_loss
        return total_loss
```

### 5.2 训练循环实现

```python
def train_bilingual_soonet(model, train_loader, val_loader, num_epochs=10):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    optimizer = torch.optim.AdamW([
        {"params": model.soonet.parameters(), "lr": 1e-5},
        {"params": model.chinese_clip.parameters(), "lr": 5e-6},
        {"params": model.text_projection.parameters(), "lr": 1e-4}
    ])
    criterion = BilingualLoss(alpha=0.7)  # 更注重中文分支

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch_idx, batch in enumerate(train_loader):
            optimizer.zero_grad()

            # 处理英文输入
            en_outputs = model((batch["en_text"], batch["frames"]))
            # 处理中文输入
            zh_outputs = model((batch["zh_text"], batch["frames"]))

            # 计算损失
            targets = torch.tensor(batch["timestamps"]).to(device)
            loss = criterion(en_outputs, zh_outputs, targets)

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            if batch_idx % 100 == 0:
                print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item():.4f}")

        # 验证阶段
        model.eval()
        val_loss = validate(model, val_loader, criterion, device)
        print(f"Epoch {epoch} completed. "
              f"Train Loss: {total_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}")

        # 保存检查点
        if epoch % 5 == 0:
            torch.save({
                "epoch": epoch,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "loss": total_loss / len(train_loader)
            }, f"soonet_zh_checkpoint_epoch_{epoch}.pth")
```

## 6. 推理与部署

### 6.1 修改推理pipeline

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks


def create_bilingual_pipeline(model_dir):
    """创建支持中英文的SOONet pipeline"""

    # 修改默认的pipeline以支持中文
    class BilingualVideoTemporalGroundingPipeline:
        def __init__(self, model_dir):
            self.model = SOONetWithChineseCLIP(model_dir)
            self.model.eval()

        def __call__(self, inputs):
            if isinstance(inputs, tuple):
                text, video_path = inputs
            else:
                text = inputs["text"]
                video_path = inputs["video"]

            # 加载视频
            video_frames = self._load_video(video_path)
            with torch.no_grad():
                result = self.model((text, video_frames))

            return {
                "scores": result["scores"].tolist(),
                "timestamps": result["timestamps"].tolist()
            }

        def _load_video(self, video_path):
            """加载视频并提取帧"""
            import cv2
            cap = cv2.VideoCapture(video_path)
            frames = []
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = cv2.resize(frame, (224, 224))
                frames.append(frame)
            cap.release()
            return frames

    return BilingualVideoTemporalGroundingPipeline(model_dir)


# 使用示例
bilingual_pipeline = create_bilingual_pipeline("/path/to/modified/soonet")

# 中文查询
zh_result = bilingual_pipeline(("一个人从冰箱里拿出食物", "video.mp4"))
print(f"中文查询结果: {zh_result}")

# 英文查询
en_result = bilingual_pipeline(("a man takes food out of the refrigerator", "video.mp4"))
print(f"英文查询结果: {en_result}")
```

### 6.2 Gradio Web界面适配

```python
import gradio as gr


def create_bilingual_interface():
    """创建支持中英文的Web界面"""
    pipeline = create_bilingual_pipeline(MODEL_DIR)

    def process_video(text, video_file):
        # 保存上传的视频
        video_path = f"/tmp/{video_file.name}"
        with open(video_path, "wb") as f:
            f.write(video_file.read())

        # 执行推理
        result = pipeline((text, video_path))

        # 格式化结果
        output = f"查询: {text}\n\n"
        for i, (score, (start, end)) in enumerate(
                zip(result["scores"], result["timestamps"])):
            output += f"片段 {i+1}: {start:.1f}s - {end:.1f}s (置信度: {score:.3f})\n"
        return output

    # 创建界面
    with gr.Blocks(title="SOONet 中英文视频时序定位") as demo:
        gr.Markdown("# SOONet 中英文视频时序定位系统")
        gr.Markdown("支持中文和英文查询，输入自然语言描述来定位视频中的相关片段")

        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="查询文本",
                    placeholder="输入中文或英文描述，如：一个人从冰箱里拿出食物"
                )
                video_input = gr.Video(label="上传视频")
                submit_btn = gr.Button("开始定位")
            with gr.Column():
                output_text = gr.Textbox(label="定位结果", lines=10)

        # 示例
        gr.Examples(
            examples=[
                ["一个人从冰箱里拿出食物", "example_video.mp4"],
                ["a man takes food out of the refrigerator", "example_video.mp4"],
                ["两个人在公园里散步", "example_video2.mp4"]
            ],
            inputs=[text_input, video_input]
        )

        submit_btn.click(
            fn=process_video,
            inputs=[text_input, video_input],
            outputs=output_text
        )

    return demo


# 启动服务
if __name__ == "__main__":
    demo = create_bilingual_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860)
```

## 7. 效果评估与优化

### 7.1 评估指标

为了评估中英文版本的效果，我们需要建立评估数据集：

```python
def evaluate_bilingual_model(model, test_dataset):
    """评估双语模型性能"""
    results = {
        "chinese": {"precision": [], "recall": [], "f1": []},
        "english": {"precision": [], "recall": [], "f1": []}
    }

    model.eval()
    with torch.no_grad():
        for item in test_dataset:
            # 中文查询评估
            zh_pred = model((item["zh_text"], item["frames"]))
            zh_metrics = calculate_metrics(zh_pred, item["timestamps"])

            # 英文查询评估
            en_pred = model((item["en_text"], item["frames"]))
            en_metrics = calculate_metrics(en_pred, item["timestamps"])

            # 记录结果
            for metric in ["precision", "recall", "f1"]:
                results["chinese"][metric].append(zh_metrics[metric])
                results["english"][metric].append(en_metrics[metric])

    # 计算平均指标
    avg_results = {}
    for lang in ["chinese", "english"]:
        avg_results[lang] = {
            metric: np.mean(results[lang][metric])
            for metric in ["precision", "recall", "f1"]
        }
    return avg_results


def calculate_metrics(predicted, ground_truth, iou_threshold=0.5):
    """计算时序定位的评估指标"""
    # 计算IoU
    iou = temporal_iou(predicted, ground_truth)
    precision = 1.0 if iou >= iou_threshold else 0.0
    recall = 1.0 if iou >= iou_threshold else 0.0
    f1 = 2 * precision * recall / (precision + recall + 1e-8)
    return {"precision": precision, "recall": recall, "f1": f1}


def temporal_iou(pred_interval, gt_interval):
    """计算两个时间区间的IoU"""
    start_pred, end_pred = pred_interval
    start_gt, end_gt = gt_interval

    intersection_start = max(start_pred, start_gt)
    intersection_end = min(end_pred, end_gt)
    if intersection_start >= intersection_end:
        return 0.0

    intersection = intersection_end - intersection_start
    union = (end_pred - start_pred) + (end_gt - start_gt) - intersection
    return intersection / union
```

### 7.2 持续优化策略

基于评估结果，我们可以进一步优化模型：

```python
def optimize_model_based_on_evaluation(model, eval_results, train_loader):
    """根据评估结果优化模型"""
    # 分析中英文性能差异
    zh_f1 = eval_results["chinese"]["f1"]
    en_f1 = eval_results["english"]["f1"]

    if zh_f1 < en_f1 * 0.8:  # 中文性能明显较差
        print("中文性能需要优化，增加中文数据权重")
        # 调整损失函数权重
        criterion = BilingualLoss(alpha=0.8)  # 更注重中文
        # 针对性训练
        fine_tune_chinese_branch(model, train_loader, criterion)
    elif en_f1 < zh_f1 * 0.8:  # 英文性能明显较差
        print("英文性能需要优化，确保英文分支不受影响")
        # 冻结中文分支，只训练英文分支
        freeze_chinese_branch(model)
        fine_tune_english_branch(model, train_loader)
    else:
        print("中英文性能均衡，进行整体微调")
        fine_tune_both_branches(model, train_loader)


def fine_tune_chinese_branch(model, train_loader, criterion, num_epochs=3):
    """针对性优化中文分支"""
    optimizer = torch.optim.AdamW([
        {"params": model.chinese_clip.parameters(), "lr": 1e-5},
        {"params": model.text_projection.parameters(), "lr": 2e-4}
    ])

    for epoch in range(num_epochs):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            # 只使用中文数据
            zh_outputs = model((batch["zh_text"], batch["frames"]))
            targets = torch.tensor(batch["timestamps"]).to(device)
            loss = F.smooth_l1_loss(zh_outputs, targets)
            loss.backward()
            optimizer.step()
```

## 8. 总结与展望

通过添加中文CLIP分支和对SOONet进行双语微调，我们成功扩展了模型的中文处理能力。这种方法的关键在于：

- **架构设计**：在保持原有英文能力的基础上添加中文分支
- **数据策略**：使用中英文平行语料进行联合训练
- **损失设计**：平衡中英文分支的学习，确保一致性
- **渐进优化**：基于评估结果针对性优化弱项

实际部署中，这种双语支持大大提升了SOONet在中文环境的实用性。用户现在可以用自然的中文描述来定位视频内容，而不需要翻译成英文。

未来的优化方向包括：

- 支持更多语言分支
- 改进语言自动检测机制
- 优化多语言间的知识迁移
- 减少模型参数量和计算开销

这种方法不仅适用于SOONet，也可以为其他多模态模型的多语言适配提供参考。

**获取更多AI镜像**：想探索更多AI镜像和应用场景，访问 CSDN星图镜像广场，提供丰富的预置镜像，覆盖大模型推理、图像生成、视频生成、模型微调等多个领域，支持一键部署。