Graphormer实战教程:批量SMILES文件导入+CSV结果导出自动化脚本编写

张开发
2026/4/7 14:27:43 15 分钟阅读

分享文章

Graphormer实战教程:批量SMILES文件导入+CSV结果导出自动化脚本编写
Graphormer实战教程：批量SMILES文件导入+CSV结果导出自动化脚本编写

## 1. 教程概述

Graphormer是一种基于纯Transformer架构的图神经网络，专门为分子图（原子-键结构）的全局结构建模与属性预测而设计。它在OGB、PCQM4M等分子基准测试中表现优异，大幅超越了传统GNN模型。

本教程将带你从零开始，学习如何编写自动化脚本，实现以下功能：

- 批量导入SMILES格式的分子结构文件
- 使用Graphormer进行分子属性预测
- 将预测结果导出为CSV格式

## 2. 环境准备

### 2.1 基础环境要求

确保你的系统满足以下要求：

- Python 3.8+
- PyTorch 2.0+
- RDKit (用于分子处理)
- 至少8GB可用内存
- 推荐使用GPU加速

### 2.2 安装依赖包

```bash
pip install torch torch-geometric ogb rdkit-pypi gradio pandas
```

### 2.3 验证Graphormer服务

确保Graphormer服务已启动并运行在7860端口：

```bash
curl http://localhost:7860
```

如果看到Gradio界面返回的HTML内容，说明服务正常运行。

## 3. 核心脚本编写

### 3.1 批量处理脚本框架

我们先创建一个基础脚本框架 batch_predict.py：

```python
import requests
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import time


class GraphormerBatchPredictor:
    def __init__(self, api_url="http://localhost:7860"):
        self.api_url = api_url

    def load_smiles_file(self, file_path):
        """加载SMILES文件"""
        pass

    def predict_single_molecule(self, smiles):
        """单个分子预测"""
        pass

    def batch_predict(self, smiles_list):
        """批量预测"""
        pass

    def save_to_csv(self, results, output_file):
        """保存结果到CSV"""
        pass


if __name__ == "__main__":
    predictor = GraphormerBatchPredictor()
    # 这里添加主程序逻辑
```

### 3.2 实现SMILES文件加载

完善 load_smiles_file 方法，支持多种格式输入：

```python
def load_smiles_file(self, file_path):
    """支持.txt和.csv格式的SMILES文件"""
    if file_path.endswith(".txt"):
        with open(file_path, "r") as f:
            return [line.strip() for line in f if line.strip()]
    elif file_path.endswith(".csv"):
        df = pd.read_csv(file_path)
        if "smiles" in df.columns:
            return df["smiles"].tolist()
        else:
            raise ValueError("CSV文件必须包含smiles列")
    else:
        raise ValueError("只支持.txt和.csv格式文件")
```

### 3.3 实现单个分子预测

添加与Graphormer API交互的逻辑：

```python
def predict_single_molecule(self, smiles, task="property-guided"):
    """调用Graphormer API进行预测"""
    try:
        # 验证SMILES有效性
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            return {"smiles": smiles, "error": "Invalid SMILES"}

        # 准备请求数据
        data = {
            "smiles": smiles,
            "task": task
        }

        # 发送预测请求
        response = requests.post(
            f"{self.api_url}/predict",
            json=data,
            timeout=60
        )

        if response.status_code == 200:
            result = response.json()
            return {"smiles": smiles, **result}
        else:
            return {"smiles": smiles, "error": f"API Error: {response.text}"}
    except Exception as e:
        return {"smiles": smiles, "error": str(e)}
```

### 3.4 实现批量预测

添加批量处理逻辑，包含进度显示和错误处理：

```python
def batch_predict(self, smiles_list, task="property-guided", delay=0.5):
    """批量预测，delay参数控制请求间隔"""
    results = []
    total = len(smiles_list)

    for i, smiles in enumerate(smiles_list, 1):
        print(f"\rProcessing {i}/{total} ({i/total:.1%})", end="")
        result = self.predict_single_molecule(smiles, task)
        results.append(result)
        time.sleep(delay)  # 避免服务器过载

    print("\nBatch prediction completed!")
    return results
```

### 3.5 实现结果导出

完善CSV导出功能：

```python
def save_to_csv(self, results, output_file):
    """将预测结果保存为CSV"""
    df = pd.DataFrame(results)

    # 展开嵌套的预测结果
    if "predictions" in df.columns:
        predictions_df = pd.json_normalize(df["predictions"])
        df = pd.concat([df.drop("predictions", axis=1), predictions_df], axis=1)

    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")
```

## 4. 完整脚本整合

将上述代码整合成完整的 batch_predict.py：

```python
import requests
import pandas as pd
from rdkit import Chem
import time
import argparse


class GraphormerBatchPredictor:
    def __init__(self, api_url="http://localhost:7860"):
        self.api_url = api_url

    def load_smiles_file(self, file_path):
        """支持.txt和.csv格式的SMILES文件"""
        if file_path.endswith(".txt"):
            with open(file_path, "r") as f:
                return [line.strip() for line in f if line.strip()]
        elif file_path.endswith(".csv"):
            df = pd.read_csv(file_path)
            if "smiles" in df.columns:
                return df["smiles"].tolist()
            else:
                raise ValueError("CSV文件必须包含smiles列")
        else:
            raise ValueError("只支持.txt和.csv格式文件")

    def predict_single_molecule(self, smiles, task="property-guided"):
        """调用Graphormer API进行预测"""
        try:
            mol = Chem.MolFromSmiles(smiles)
            if not mol:
                return {"smiles": smiles, "error": "Invalid SMILES"}

            data = {"smiles": smiles, "task": task}
            response = requests.post(
                f"{self.api_url}/predict",
                json=data,
                timeout=60
            )

            if response.status_code == 200:
                result = response.json()
                return {"smiles": smiles, **result}
            else:
                return {"smiles": smiles, "error": f"API Error: {response.text}"}
        except Exception as e:
            return {"smiles": smiles, "error": str(e)}

    def batch_predict(self, smiles_list, task="property-guided", delay=0.5):
        """批量预测"""
        results = []
        total = len(smiles_list)

        for i, smiles in enumerate(smiles_list, 1):
            print(f"\rProcessing {i}/{total} ({i/total:.1%})", end="")
            result = self.predict_single_molecule(smiles, task)
            results.append(result)
            time.sleep(delay)

        print("\nBatch prediction completed!")
        return results

    def save_to_csv(self, results, output_file):
        """将预测结果保存为CSV"""
        df = pd.DataFrame(results)

        if "predictions" in df.columns:
            predictions_df = pd.json_normalize(df["predictions"])
            df = pd.concat([df.drop("predictions", axis=1), predictions_df], axis=1)

        df.to_csv(output_file, index=False)
        print(f"Results saved to {output_file}")


def main():
    parser = argparse.ArgumentParser(description="Graphormer批量预测脚本")
    parser.add_argument("input_file", help="输入SMILES文件(.txt或.csv)")
    parser.add_argument("output_file", help="输出CSV文件")
    parser.add_argument("--task", default="property-guided",
                        choices=["property-guided", "catalyst-adsorption"],
                        help="预测任务类型")
    parser.add_argument("--delay", type=float, default=0.5,
                        help="请求间隔时间(秒)")

    args = parser.parse_args()

    predictor = GraphormerBatchPredictor()

    try:
        print(f"Loading SMILES from {args.input_file}...")
        smiles_list = predictor.load_smiles_file(args.input_file)

        print(f"Starting batch prediction for {len(smiles_list)} molecules...")
        results = predictor.batch_predict(smiles_list, args.task, args.delay)

        predictor.save_to_csv(results, args.output_file)
    except Exception as e:
        print(f"Error: {str(e)}")


if __name__ == "__main__":
    main()
```

## 5. 使用示例

### 5.1 准备测试数据

创建一个 test_smiles.txt 文件，内容如下：

```text
CCO
c1ccccc1
CC(O)O
C
O
CO
```

### 5.2 运行批量预测

```bash
python batch_predict.py test_smiles.txt results.csv --task property-guided
```

### 5.3 查看输出结果

生成的 results.csv 将包含类似以下内容：

| smiles | error | predictions.property1 | predictions.property2 |
| --- | --- | --- | --- |
| CCO | | 0.75 | 1.23 |
| c1ccccc1 | | 0.82 | 1.45 |
| CC(O)O | | 0.68 | 1.12 |
| C | | 0.45 | 0.89 |
| O | | 0.92 | 1.67 |
| CO | | 0.71 | 1.34 |

## 6. 进阶优化

### 6.1 多线程处理

为提高处理速度，可以添加多线程支持：

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def batch_predict(self, smiles_list, task="property-guided", max_workers=4, delay=0.1):
    """多线程批量预测"""
    results = []
    total = len(smiles_list)

    def process_smiles(smiles):
        time.sleep(delay)  # 控制请求速率
        return self.predict_single_molecule(smiles, task)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_smiles, smi): smi for smi in smiles_list}

        for i, future in enumerate(as_completed(futures), 1):
            print(f"\rProcessing {i}/{total} ({i/total:.1%})", end="")
            results.append(future.result())

    print("\nBatch prediction completed!")
    return results
```

### 6.2 断点续传

添加断点续传功能，避免中途失败需要重新开始。注意：该实现用到了标准库 os 和 pickle，需要在脚本开头补充 `import os` 和 `import pickle`。

```python
def batch_predict(self, smiles_list, task="property-guided", checkpoint_file=None):
    """支持断点续传的批量预测"""
    if checkpoint_file and os.path.exists(checkpoint_file):
        with open(checkpoint_file, "rb") as f:
            results = pickle.load(f)
        processed = {r["smiles"] for r in results if "smiles" in r}
    else:
        results = []
        processed = set()

    total = len(smiles_list)
    remaining = [smi for smi in smiles_list if smi not in processed]

    for i, smiles in enumerate(remaining, len(processed)+1):
        print(f"\rProcessing {i}/{total} ({i/total:.1%})", end="")
        result = self.predict_single_molecule(smiles, task)
        results.append(result)

        if checkpoint_file:
            with open(checkpoint_file, "wb") as f:
                pickle.dump(results, f)

    print("\nBatch prediction completed!")
    return results
```

## 7. 总结

通过本教程，我们实现了一个完整的Graphormer批量预测脚本，具有以下特点：

- 灵活输入支持：可以处理.txt和.csv格式的SMILES文件
- 健壮的错误处理：能够识别无效SMILES并记录API错误
- 进度可视化：实时显示处理进度
- 结构化输出：结果保存为易于分析的CSV格式
- 可扩展性：支持多线程和断点续传等进阶功能

这个脚本可以广泛应用于：

- 药物发现中的高通量分子筛选
- 材料科学中的分子特性批量分析
- 化学研究中的大规模分子数据集处理

## 获取更多AI镜像

想探索更多AI镜像和应用场景？访问 CSDN星图镜像广场，提供丰富的预置镜像，覆盖大模型推理、图像生成、视频生成、模型微调等多个领域，支持一键部署。

更多文章