故障诊断大模型 | 迈向新一代智能维护:大模型与小模型的协同融合
2025/12/18 1:04:22
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Generate large comma-separated TXT data for single-node text processing.
Each line = one record, fields separated by ',' (no quotes).

Field order of every record:
ts, ip, user_id, method, path, status, latency_ms, bytes, ua_id, trace_id
"""
import argparse
import datetime as dt
import gzip
import os
import random
from typing import IO

METHODS = ["GET", "POST", "PUT", "DELETE"]
PATHS = [
    "/api/login",
    "/api/logout",
    "/api/orders",
    "/api/orders/{id}",
    "/api/users/{id}",
    "/api/products",
    "/api/products/{id}",
    "/api/search",
    "/api/metrics",
    "/health",
]
# Weighted pool: each status appears as many times as its weight
# (the weights add up to 100), so rng.choice() samples it proportionally.
STATUS_POOL = (
    [200] * 70
    + [201] * 5
    + [204] * 5
    + [400] * 5
    + [401] * 2
    + [403] * 2
    + [404] * 6
    + [429] * 1
    + [500] * 3
    + [502] * 1
)
UA_IDS = list(range(1, 51))  # 1..50

_HEX_DIGITS = "0123456789abcdef"


def open_text(path: str, gzip_on: bool) -> IO[str]:
    """Open *path* for UTF-8 text writing, creating parent directories.

    The stream is gzip-compressed when *gzip_on* is true or when *path*
    ends with ".gz". Newlines are always written as "\\n".
    """
    os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
    if gzip_on or path.endswith(".gz"):
        return gzip.open(path, "wt", encoding="utf-8", newline="\n")
    return open(path, "w", encoding="utf-8", newline="\n")


def rand_ip(rng: random.Random) -> str:
    """Return a random dotted-quad IPv4 string.

    15% of the draws are 10.x.x.x, the next 10% are 192.168.x.x, and the
    rest have a first octet in 1..223. The last octet is always 1..254.
    """
    r = rng.random()
    if r < 0.15:
        return (
            f"10.{rng.randrange(256)}.{rng.randrange(256)}"
            f".{rng.randrange(1, 255)}"
        )
    if r < 0.25:
        return f"192.168.{rng.randrange(256)}.{rng.randrange(1, 255)}"
    return (
        f"{rng.randrange(1, 224)}.{rng.randrange(256)}"
        f".{rng.randrange(256)}.{rng.randrange(1, 255)}"
    )


def rand_path(rng: random.Random) -> str:
    """Pick a path from PATHS, filling "{id}" with a random id in 1..199999."""
    p = rng.choice(PATHS)
    if "{id}" in p:
        return p.replace("{id}", str(rng.randrange(1, 200_000)))
    return p


def rand_trace_id(rng: random.Random, n: int = 16) -> str:
    """Return *n* random lowercase hexadecimal characters."""
    # One rng.choice() per character: keeps the random stream, and therefore
    # the generated file for a given seed, stable.
    return "".join(rng.choice(_HEX_DIGITS) for _ in range(n))


def latency_ms(rng: random.Random, status: int) -> int:
    """Return a simulated latency in milliseconds, capped at 5000.

    A log-normal base is drawn first; 5xx responses get an extra
    200..899 ms and other 4xx responses an extra 50..299 ms.
    """
    # Log-normal base with median exp(3.6), about 37 ms
    # (the original note calls the common range 20~200 ms).
    base = int(rng.lognormvariate(3.6, 0.55))
    if status >= 500:
        base += rng.randrange(200, 900)
    elif status >= 400:
        base += rng.randrange(50, 300)
    return min(base, 5000)


def bytes_sent(rng: random.Random, path: str, status: int) -> int:
    """Return a simulated response size in bytes for *path* and *status*.

    204 responses are always 0 bytes. Otherwise the path decides the range
    first (metrics, then products/search, then orders); only paths matching
    none of those fall back to the error range (status >= 400) or the
    default range.
    """
    if status == 204:
        return 0
    if "/metrics" in path:
        return rng.randrange(5_000, 30_000)
    if "/products" in path or "/search" in path:
        return rng.randrange(800, 12_000)
    if "/orders" in path:
        return rng.randrange(600, 9_000)
    if status >= 400:
        return rng.randrange(200, 2_000)
    return rng.randrange(300, 6_000)


def generate(
    out: str,
    lines: int,
    users: int,
    start_time: dt.datetime,
    span_seconds: int,
    seed: int,
    gzip_on: bool,
) -> None:
    """Write *lines* random records to *out*, one per line.

    Args:
        out: Output file path; parent directories are created.
        lines: Number of records to write.
        users: user_id is drawn from 1..users.
        start_time: Earliest timestamp.
        span_seconds: Timestamps fall in [start_time, start_time + span).
        seed: Seed for the private random.Random instance; the same
            arguments always produce the same file.
        gzip_on: Force gzip output (also implied by a ".gz" suffix).

    Raises:
        ValueError: If *users* or *span_seconds* is smaller than 1.
    """
    # Validate before touching the filesystem: randrange() would reject
    # these values anyway, but only after the output file had already been
    # created or truncated.
    if users < 1:
        raise ValueError(f"users must be >= 1, got {users}")
    if span_seconds < 1:
        raise ValueError(f"span_seconds must be >= 1, got {span_seconds}")

    rng = random.Random(seed)
    with open_text(out, gzip_on) as f:
        for _ in range(lines):
            # The order of the rng calls below defines the output for a
            # given seed; do not reorder.
            ts = start_time + dt.timedelta(seconds=rng.randrange(span_seconds))
            ts_str = ts.strftime("%Y-%m-%dT%H:%M:%S")
            ip = rand_ip(rng)
            user_id = rng.randrange(1, users + 1)
            method = rng.choice(METHODS)
            path = rand_path(rng)
            status = rng.choice(STATUS_POOL)
            latency = latency_ms(rng, status)
            size = bytes_sent(rng, path, status)
            ua_id = rng.choice(UA_IDS)
            trace_id = rand_trace_id(rng)
            f.write(
                f"{ts_str},{ip},{user_id},{method},{path},"
                f"{status},{latency},{size},{ua_id},{trace_id}\n"
            )


def main():
    """Parse command-line options and generate the data file."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--lines", type=int, default=1_000_000)
    ap.add_argument("--users", type=int, default=200_000)
    ap.add_argument("--start", type=str, default="2025-12-01T00:00:00")
    ap.add_argument("--span-seconds", type=int, default=86400)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--out", type=str, default="data/log_1m.txt")
    ap.add_argument("--gzip", action="store_true")
    args = ap.parse_args()

    try:
        start_time = dt.datetime.fromisoformat(args.start)
    except ValueError:
        # Report a malformed timestamp as a usage error, not a traceback.
        ap.error(f"--start is not a valid ISO timestamp: {args.start!r}")

    generate(
        out=args.out,
        lines=args.lines,
        users=args.users,
        start_time=start_time,
        span_seconds=args.span_seconds,
        seed=args.seed,
        gzip_on=args.gzip,
    )


if __name__ == "__main__":
    main()

# Meaning of the fields on each line (fixed order):
ts, ip, user_id, method, path, status, latency_ms, bytes, ua_id, trace_id
例如:
2025-12-01T23:16:50,10.140.125.58,36580,GET,/api/metrics,200,39,8070,14,706d7e805da846a3
这些是任何实现都应该先完成的。
status >= 400 的请求数
200 -> 734,921
204 -> 48,203
401 -> 12,881
404 -> 65,102
500 -> 9,871
HashMap / Dictionary 的使用
key 数量小,适合优化
GET -> xxx
POST -> xxx
PUT -> xxx
DELETE -> xxx
用于验证你是否避免无意义对象创建。
这些是面试/博客最爱问的。
ip 计数
user_id -> count
/api/metrics
/api/orders
/api/users/{id}
/api/users/104594 归一化成 /api/users/{id}
2025-12-01T23:16:50 → 2025-12-01T23:16
2025-12-01T23:16 -> 834
2025-12-01T23:17 -> 921
LocalDateTime.parse
hour -> error_count
例如:
03 -> 412
08 -> 1291
17 -> 2387
path -> avg(latency_ms)
sum / count
latency_ms 排序
status >= 400 AND GROUP BY path
非常贴近真实 SRE / APM 分析。
这是真实生产排障模型。
sum(bytes)
avg(bytes)
ip -> sum(bytes)