diff --git a/03_nf4_dequant/ayepei/bnb_benchmark_results.csv b/03_nf4_dequant/ayepei/bnb_benchmark_results.csv new file mode 100644 index 0000000..27a93e2 --- /dev/null +++ b/03_nf4_dequant/ayepei/bnb_benchmark_results.csv @@ -0,0 +1,21 @@ +Shape,Blocksize,BnB Time (ms),Total Elements,Weight File,BnB Output File +256x256,64,0.1144,65536,weight_data/weight_256x256_bs64.bin,bnb_results/bnb_256x256_bs64.fp16 +256x256,128,0.1064,65536,weight_data/weight_256x256_bs128.bin,bnb_results/bnb_256x256_bs128.fp16 +512x512,64,0.1010,262144,weight_data/weight_512x512_bs64.bin,bnb_results/bnb_512x512_bs64.fp16 +512x512,128,0.0977,262144,weight_data/weight_512x512_bs128.bin,bnb_results/bnb_512x512_bs128.fp16 +1024x1024,64,0.0960,1048576,weight_data/weight_1024x1024_bs64.bin,bnb_results/bnb_1024x1024_bs64.fp16 +1024x1024,128,0.0912,1048576,weight_data/weight_1024x1024_bs128.bin,bnb_results/bnb_1024x1024_bs128.fp16 +2048x2048,64,0.0910,4194304,weight_data/weight_2048x2048_bs64.bin,bnb_results/bnb_2048x2048_bs64.fp16 +2048x2048,128,0.0925,4194304,weight_data/weight_2048x2048_bs128.bin,bnb_results/bnb_2048x2048_bs128.fp16 +4096x4096,64,0.0891,16777216,weight_data/weight_4096x4096_bs64.bin,bnb_results/bnb_4096x4096_bs64.fp16 +4096x4096,128,0.0905,16777216,weight_data/weight_4096x4096_bs128.bin,bnb_results/bnb_4096x4096_bs128.fp16 +8192x8192,64,0.0918,67108864,weight_data/weight_8192x8192_bs64.bin,bnb_results/bnb_8192x8192_bs64.fp16 +8192x8192,128,0.0912,67108864,weight_data/weight_8192x8192_bs128.bin,bnb_results/bnb_8192x8192_bs128.fp16 +16384x16384,64,0.3074,268435456,weight_data/weight_16384x16384_bs64.bin,bnb_results/bnb_16384x16384_bs64.fp16 +16384x16384,128,0.2973,268435456,weight_data/weight_16384x16384_bs128.bin,bnb_results/bnb_16384x16384_bs128.fp16 +3421x3146,64,0.0925,10762466,weight_data/weight_3421x3146_bs64.bin,bnb_results/bnb_3421x3146_bs64.fp16 +3421x3146,128,0.0904,10762466,weight_data/weight_3421x3146_bs128.bin,bnb_results/bnb_3421x3146_bs128.fp16 +6578x1236,64,0.0895,8130408,weight_data/weight_6578x1236_bs64.bin,bnb_results/bnb_6578x1236_bs64.fp16 +6578x1236,128,0.0876,8130408,weight_data/weight_6578x1236_bs128.bin,bnb_results/bnb_6578x1236_bs128.fp16 +7000x7000,64,0.0909,49000000,weight_data/weight_7000x7000_bs64.bin,bnb_results/bnb_7000x7000_bs64.fp16 +7000x7000,128,0.0939,49000000,weight_data/weight_7000x7000_bs128.bin,bnb_results/bnb_7000x7000_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_1024x1024_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_1024x1024_bs128.log new file mode 100644 index 0000000..02821c0 --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_1024x1024_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_1024x1024_bs128.bin +rows=1024 +cols=1024 +blocksize=128 +total_elements=1048576 +kernel_time_ms=0.0030 +bandwidth_gbps=816.30 +output_file=cuda_results/dequant_1024x1024_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_1024x1024_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_1024x1024_bs64.log new file mode 100644 index 0000000..d849144 --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_1024x1024_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_1024x1024_bs64.bin +rows=1024 +cols=1024 +blocksize=64 +total_elements=1048576 +kernel_time_ms=0.0029 +bandwidth_gbps=845.08 +output_file=cuda_results/dequant_1024x1024_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_16384x16384_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_16384x16384_bs128.log new file mode 100644 index 0000000..be9a2b9 --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_16384x16384_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_16384x16384_bs128.bin +rows=16384 +cols=16384 +blocksize=128 +total_elements=268435456 +kernel_time_ms=0.2102 +bandwidth_gbps=2991.95 +output_file=cuda_results/dequant_16384x16384_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_16384x16384_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_16384x16384_bs64.log new file mode 100644 index 0000000..ecf536e --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_16384x16384_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_16384x16384_bs64.bin +rows=16384 +cols=16384 +blocksize=64 +total_elements=268435456 +kernel_time_ms=0.2105 +bandwidth_gbps=2987.26 +output_file=cuda_results/dequant_16384x16384_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_2048x2048_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_2048x2048_bs128.log new file mode 100644 index 0000000..d5977b1 --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_2048x2048_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_2048x2048_bs128.bin +rows=2048 +cols=2048 +blocksize=128 +total_elements=4194304 +kernel_time_ms=0.0049 +bandwidth_gbps=2017.74 +output_file=cuda_results/dequant_2048x2048_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_2048x2048_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_2048x2048_bs64.log new file mode 100644 index 0000000..77d3f61 --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_2048x2048_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_2048x2048_bs64.bin +rows=2048 +cols=2048 +blocksize=64 +total_elements=4194304 +kernel_time_ms=0.0047 +bandwidth_gbps=2082.46 +output_file=cuda_results/dequant_2048x2048_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_256x256_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_256x256_bs128.log new file mode 100644 index 0000000..7d15411 --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_256x256_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_256x256_bs128.bin +rows=256 +cols=256 +blocksize=128 +total_elements=65536 +kernel_time_ms=0.0026 +bandwidth_gbps=58.80 +output_file=cuda_results/dequant_256x256_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_256x256_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_256x256_bs64.log new file mode 100644 index 0000000..1b9f8fc --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_256x256_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_256x256_bs64.bin +rows=256 +cols=256 +blocksize=64 +total_elements=65536 +kernel_time_ms=0.0026 +bandwidth_gbps=58.80 +output_file=cuda_results/dequant_256x256_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_3421x3146_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_3421x3146_bs128.log new file mode 100644 index 0000000..a4519d0 --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_3421x3146_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_3421x3146_bs128.bin +rows=3421 +cols=3146 +blocksize=128 +total_elements=10762466 +kernel_time_ms=0.0086 +bandwidth_gbps=2939.31 +output_file=cuda_results/dequant_3421x3146_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_3421x3146_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_3421x3146_bs64.log new file mode 100644 index 0000000..63b378d --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_3421x3146_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_3421x3146_bs64.bin +rows=3421 +cols=3146 +blocksize=64 +total_elements=10762466 +kernel_time_ms=0.0086 +bandwidth_gbps=2935.13 +output_file=cuda_results/dequant_3421x3146_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_4096x4096_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_4096x4096_bs128.log new file mode 100644 index 0000000..7740d29 --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_4096x4096_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_4096x4096_bs128.bin +rows=4096 +cols=4096 +blocksize=128 +total_elements=16777216 +kernel_time_ms=0.0139 +bandwidth_gbps=2832.19 +output_file=cuda_results/dequant_4096x4096_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_4096x4096_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_4096x4096_bs64.log new file mode 100644 index 0000000..2901aa9 --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_4096x4096_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_4096x4096_bs64.bin +rows=4096 +cols=4096 +blocksize=64 +total_elements=16777216 +kernel_time_ms=0.0138 +bandwidth_gbps=2840.68 +output_file=cuda_results/dequant_4096x4096_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_512x512_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_512x512_bs128.log new file mode 100644 index 0000000..80257fa --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_512x512_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_512x512_bs128.bin +rows=512 +cols=512 +blocksize=128 +total_elements=262144 +kernel_time_ms=0.0028 +bandwidth_gbps=221.24 +output_file=cuda_results/dequant_512x512_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_512x512_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_512x512_bs64.log new file mode 100644 index 0000000..f92896f --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_512x512_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_512x512_bs64.bin +rows=512 +cols=512 +blocksize=64 +total_elements=262144 +kernel_time_ms=0.0028 +bandwidth_gbps=221.82 +output_file=cuda_results/dequant_512x512_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_6578x1236_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_6578x1236_bs128.log new file mode 100644 index 0000000..d234416 --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_6578x1236_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_6578x1236_bs128.bin +rows=6578 +cols=1236 +blocksize=128 +total_elements=8130408 +kernel_time_ms=0.0070 +bandwidth_gbps=2717.80 +output_file=cuda_results/dequant_6578x1236_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_6578x1236_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_6578x1236_bs64.log new file mode 100644 index 0000000..9a36bd8 --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_6578x1236_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_6578x1236_bs64.bin +rows=6578 +cols=1236 +blocksize=64 +total_elements=8130408 +kernel_time_ms=0.0072 +bandwidth_gbps=2638.75 +output_file=cuda_results/dequant_6578x1236_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_7000x7000_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_7000x7000_bs128.log new file mode 100644 index 0000000..5599e37 --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_7000x7000_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_7000x7000_bs128.bin +rows=7000 +cols=7000 +blocksize=128 +total_elements=49000000 +kernel_time_ms=0.0412 +bandwidth_gbps=2787.30 +output_file=cuda_results/dequant_7000x7000_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_7000x7000_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_7000x7000_bs64.log new file mode 100644 index 0000000..2c36614 --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_7000x7000_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_7000x7000_bs64.bin +rows=7000 +cols=7000 +blocksize=64 +total_elements=49000000 +kernel_time_ms=0.0414 +bandwidth_gbps=2771.04 +output_file=cuda_results/dequant_7000x7000_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_8192x8192_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_8192x8192_bs128.log new file mode 100644 index 0000000..128224f --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_8192x8192_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_8192x8192_bs128.bin +rows=8192 +cols=8192 +blocksize=128 +total_elements=67108864 +kernel_time_ms=0.0552 +bandwidth_gbps=2850.08 +output_file=cuda_results/dequant_8192x8192_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_8192x8192_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_8192x8192_bs64.log new file mode 100644 index 0000000..cc998ca --- /dev/null +++ b/03_nf4_dequant/ayepei/cuda_results/perf_8192x8192_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_8192x8192_bs64.bin +rows=8192 +cols=8192 +blocksize=64 +total_elements=67108864 +kernel_time_ms=42.5944 +bandwidth_gbps=3.69 +output_file=cuda_results/dequant_8192x8192_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/generate_and_benchmark_bnb.py b/03_nf4_dequant/ayepei/generate_and_benchmark_bnb.py new file mode 100644 index 0000000..2ef8f59 --- /dev/null +++ b/03_nf4_dequant/ayepei/generate_and_benchmark_bnb.py @@ -0,0 +1,284 @@ +import torch +import bitsandbytes.functional as F +import numpy as np +import struct +import os +import csv +from pathlib import Path + +# 保存权重数据到文件 +def save_weight_data(filename, rows, cols, blocksize, + packed, absmax_q, absmax2, code2, offset): + """ + 保存权重数据到二进制文件 + 格式: + [header] + num_rows: int64 + num_cols: int64 + blocksize: int32 + [data] + packed_weights: uint8[num_rows * num_cols / 2] + absmax_q: uint8[num_blocks] + absmax2: float16[num_groups] + code2: float16[256] + offset: float32 + """ + with open(filename, 'wb') as f: + # 写入 header + f.write(struct.pack('q', rows)) # num_rows (int64) + f.write(struct.pack('q', cols)) # num_cols (int64) + f.write(struct.pack('i', blocksize)) # blocksize (int32) + + # 写入 packed_weights (uint8) + packed_np = packed.cpu().numpy().astype(np.uint8) + f.write(packed_np.tobytes()) + + # 写入 absmax_q (uint8) + absmax_q_np = absmax_q.cpu().numpy().astype(np.uint8) + f.write(absmax_q_np.tobytes()) + + # 写入 absmax2 (float16) + absmax2_np = absmax2.cpu().numpy().astype(np.float16) + f.write(absmax2_np.tobytes()) + + # 写入 code2 (float16[256]) - 确保长度为256 + code2_np = code2.cpu().numpy().astype(np.float16) + if len(code2_np) < 256: + code2_padded = np.zeros(256, dtype=np.float16) + code2_padded[:len(code2_np)] = code2_np + else: + code2_padded = code2_np[:256] + f.write(code2_padded.tobytes()) + + # 写入 offset (float32) + f.write(struct.pack('f', offset)) + + file_size = os.path.getsize(filename) + print(f" 权重文件已保存: {filename} ({file_size/1024:.2f} KB)") + +# 保存 bitsandbytes 的解量化结果 +def save_bnb_output(filename, output_tensor, rows, cols): + """ + 保存 bitsandbytes 的解量化结果 + 格式:float16 二进制文件,按行主序存储 + """ + output_np = output_tensor.cpu().numpy().astype(np.float16) + output_np.tofile(filename) + file_size = os.path.getsize(filename) + print(f" BnB 结果已保存: {filename} ({file_size/1024:.2f} KB)") + +# ========================================================= +# 运行 bitsandbytes 解量化并计时 +# ========================================================= +def run_bnb_dequant(packed, state, test_iters=100): + """ + 运行 bitsandbytes 解量化,返回结果和平均执行时间 + """ + print(f" 运行 bitsandbytes 解量化...") + + # 预热 + for _ in range(10): + out_ref = F.dequantize_4bit(packed, state) + + torch.cuda.synchronize() + + # 计时 + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(test_iters): + out_ref = F.dequantize_4bit(packed, state) + end.record() + + torch.cuda.synchronize() + bnb_time = start.elapsed_time(end) / test_iters + + # 最后一次结果用于保存 + final_output = F.dequantize_4bit(packed, state) + + print(f" BnB 完成, 平均时间: {bnb_time:.4f} ms") + + return final_output, bnb_time + +# ========================================================= +# 生成并保存单个测试用例 +# ========================================================= +def generate_and_test(rows, cols, blocksize, group_size=256, + save_dir="weight_data", bnb_dir="bnb_results"): + """ + 生成一个测试用例,运行 bitsandbytes,保存所有需要的数据 + """ + total = rows * cols + + print(f"\n 处理 {rows}x{cols} 矩阵, blocksize={blocksize}...") + + # 创建保存目录 + Path(save_dir).mkdir(exist_ok=True) + Path(bnb_dir).mkdir(exist_ok=True) + + # 创建权重数据(在 GPU 上) + weight = torch.randn(rows, cols, device="cuda", dtype=torch.float16) + + # 量化 + packed, state = F.quantize_4bit( + weight, + blocksize=blocksize, + quant_type="nf4", + compress_statistics=True + ) + + # 获取量化参数 + absmax_q = state.absmax.contiguous() + absmax2 = state.state2.absmax.to(torch.float16).contiguous() + code2 = state.state2.code.to(torch.float16).contiguous() + offset = float(state.offset) + + # 保存权重文件(供 CUDA 程序读取) + weight_file = f"{save_dir}/weight_{rows}x{cols}_bs{blocksize}.bin" + save_weight_data(weight_file, rows, cols, blocksize, + packed, absmax_q, absmax2, code2, offset) + + # 运行 bitsandbytes 并计时 + bnb_output, bnb_time = run_bnb_dequant(packed, state) + + # 保存 bitsandbytes 的解量化结果 + bnb_file = f"{bnb_dir}/bnb_{rows}x{cols}_bs{blocksize}.fp16" + save_bnb_output(bnb_file, bnb_output, rows, cols) + + + + return { + 'shape': f"{rows}x{cols}", + 'rows': rows, + 'cols': cols, + 'blocksize': blocksize, + 'weight_file': weight_file, + 'bnb_file': bnb_file, + 'bnb_time_ms': bnb_time, + 'total_elements': total + } + + +# 生成所有测试用例并保存结果 +def generate_all(): + """ + 生成所有测试用例,运行 bitsandbytes,保存结果 + """ + shapes = [ + (256, 256), + (512, 512), + (1024, 1024), + (2048, 2048), + (4096, 4096), + (8192, 8192), + (16384, 16384), + (3421, 3146), + (6578, 1236), + (7000, 7000), + ] + + blocksizes = [64, 128] + + save_dir = "weight_data" + bnb_dir = "bnb_results" + + print("=" * 70) + print("NF4 数据生成和 bitsandbytes 基准测试") + print("=" * 70) + print(f"权重文件保存目录: {save_dir}/") + print(f"BnB 结果保存目录: {bnb_dir}/") + print() + + results = [] + + for rows, cols in shapes: + for block in blocksizes: + info = generate_and_test(rows, cols, block, + save_dir=save_dir, bnb_dir=bnb_dir) + results.append(info) + + # 保存汇总结果到 CSV + csv_file = "bnb_benchmark_results.csv" + with open(csv_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['Shape', 'Blocksize', 'BnB Time (ms)', + 'Total Elements', 'Weight File', 'BnB Output File']) + + for r in results: + writer.writerow([ + r['shape'], + r['blocksize'], + f"{r['bnb_time_ms']:.4f}", + r['total_elements'], + r['weight_file'], + r['bnb_file'] + ]) + + print("\n" + "=" * 70) + print(" 生成完成!") + print("=" * 70) + print(f"\n BnB 基准测试结果已保存到: {csv_file}") + print("\n生成的目录结构:") + print(f" {save_dir}/ - 包含以下权重文件(供 CUDA 程序使用):") + for r in results: + print(f" - weight_{r['shape']}_bs{r['blocksize']}.bin") + + print(f"\n {bnb_dir}/ - 包含以下结果文件(用于对比):") + for r in results: + print(f" - bnb_{r['shape']}_bs{r['blocksize']}.fp16") + print(f" - original_{r['shape']}_bs{r['blocksize']}.fp16") + + print("\n" + "=" * 70) + print("BnB 执行时间汇总:") + print("-" * 70) + print(f"{'Shape':<12} {'Block':<6} {'Time (ms)':<12}") + print("-" * 70) + for r in results: + print(f"{r['shape']:<12} {r['blocksize']:<6} {r['bnb_time_ms']:<12.4f}") + print("=" * 70) + +# 生成单个测试用例 +def generate_single(rows=1024, cols=1024, blocksize=64): + """ + 生成单个测试用例 + """ + save_dir = "weight_data" + bnb_dir = "bnb_results" + Path(save_dir).mkdir(exist_ok=True) + Path(bnb_dir).mkdir(exist_ok=True) + + info = generate_and_test(rows, cols, blocksize, + save_dir=save_dir, bnb_dir=bnb_dir) + + print("\n" + "=" * 50) + print("单个测试用例生成完成") + print("=" * 50) + print(f"权重文件: {info['weight_file']}") + print(f"BnB 结果: {info['bnb_file']}") + print(f"原始权重: {info['original_file']}") + print(f"BnB 执行时间: {info['bnb_time_ms']:.4f} ms") + + return info + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description='生成 NF4 测试数据并运行 bitsandbytes') + parser.add_argument('--mode', type=str, default='all', + choices=['all', 'single'], + help='运行模式: all (所有配置) 或 single (单个配置)') + parser.add_argument('--rows', type=int, default=1024, + help='矩阵行数 (single 模式)') + parser.add_argument('--cols', type=int, default=1024, + help='矩阵列数 (single 模式)') + parser.add_argument('--blocksize', type=int, default=64, + help='块大小 (single 模式)') + + args = parser.parse_args() + + if args.mode == 'all': + generate_all() + else: + generate_single(args.rows, args.cols, args.blocksize) \ No newline at end of file diff --git a/03_nf4_dequant/ayepei/img/v3_source_1.png b/03_nf4_dequant/ayepei/img/v3_source_1.png new file mode 100644 index 0000000..29a92f7 Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v3_source_1.png differ diff --git a/03_nf4_dequant/ayepei/img/v3_source_2.png b/03_nf4_dequant/ayepei/img/v3_source_2.png new file mode 100644 index 0000000..db56e95 Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v3_source_2.png differ diff --git a/03_nf4_dequant/ayepei/img/v3_th.png b/03_nf4_dequant/ayepei/img/v3_th.png new file mode 100644 index 0000000..4b5039d Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v3_th.png differ diff --git a/03_nf4_dequant/ayepei/img/v3_warp.png b/03_nf4_dequant/ayepei/img/v3_warp.png new file mode 100644 index 0000000..527cba0 Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v3_warp.png differ diff --git a/03_nf4_dequant/ayepei/img/v4_bank.png b/03_nf4_dequant/ayepei/img/v4_bank.png new file mode 100644 index 0000000..b1c8fea Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v4_bank.png differ diff --git a/03_nf4_dequant/ayepei/img/v4_source_1.png b/03_nf4_dequant/ayepei/img/v4_source_1.png new file mode 100644 index 0000000..4201351 Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v4_source_1.png differ diff --git a/03_nf4_dequant/ayepei/img/v4_th.png b/03_nf4_dequant/ayepei/img/v4_th.png new file mode 100644 index 0000000..e22c26e Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v4_th.png differ diff --git a/03_nf4_dequant/ayepei/img/v5_source_1.png b/03_nf4_dequant/ayepei/img/v5_source_1.png new file mode 100644 index 0000000..7ea6123 Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v5_source_1.png differ diff --git a/03_nf4_dequant/ayepei/img/v5_source_2.png b/03_nf4_dequant/ayepei/img/v5_source_2.png new file mode 100644 index 0000000..ed549f2 Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v5_source_2.png differ diff --git a/03_nf4_dequant/ayepei/img/v5_th.png b/03_nf4_dequant/ayepei/img/v5_th.png new file mode 100644 index 0000000..3516548 Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v5_th.png differ diff --git a/03_nf4_dequant/ayepei/img/v5_warp.png b/03_nf4_dequant/ayepei/img/v5_warp.png new file mode 100644 index 0000000..1677ffa Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v5_warp.png differ diff --git a/03_nf4_dequant/ayepei/img/v6_source_1.png b/03_nf4_dequant/ayepei/img/v6_source_1.png new file mode 100644 index 0000000..1694d0c Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v6_source_1.png differ diff --git a/03_nf4_dequant/ayepei/img/v6_source_2.png b/03_nf4_dequant/ayepei/img/v6_source_2.png new file mode 100644 index 0000000..07a9668 Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v6_source_2.png differ diff --git a/03_nf4_dequant/ayepei/img/v6_th.png b/03_nf4_dequant/ayepei/img/v6_th.png new file mode 100644 index 0000000..70c3fa0 Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v6_th.png differ diff --git a/03_nf4_dequant/ayepei/img/v6_warp.png b/03_nf4_dequant/ayepei/img/v6_warp.png new file mode 100644 index 0000000..ae761df Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v6_warp.png differ diff --git a/03_nf4_dequant/ayepei/img/v7_bank.png b/03_nf4_dequant/ayepei/img/v7_bank.png new file mode 100644 index 0000000..99766d6 Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v7_bank.png differ diff --git a/03_nf4_dequant/ayepei/img/v7_source_1.png b/03_nf4_dequant/ayepei/img/v7_source_1.png new file mode 100644 index 0000000..0420b27 Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v7_source_1.png differ diff --git a/03_nf4_dequant/ayepei/img/v7_source_2.png b/03_nf4_dequant/ayepei/img/v7_source_2.png new file mode 100644 index 0000000..a245189 Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v7_source_2.png differ diff --git a/03_nf4_dequant/ayepei/img/v7_th.png b/03_nf4_dequant/ayepei/img/v7_th.png new file mode 100644 index 0000000..e1f4e1a Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v7_th.png differ diff --git a/03_nf4_dequant/ayepei/img/v7_warp.png b/03_nf4_dequant/ayepei/img/v7_warp.png new file mode 100644 index 0000000..b7942a6 Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v7_warp.png differ diff --git a/03_nf4_dequant/ayepei/mexc/compare_results.py b/03_nf4_dequant/ayepei/mexc/compare_results.py new file mode 100644 index 0000000..91f39ed --- /dev/null +++ b/03_nf4_dequant/ayepei/mexc/compare_results.py @@ -0,0 +1,148 @@ +import torch +import numpy as np +import pandas as pd +import csv +from pathlib import Path + + +def compare_all(): + """对比所有 musa 和 BnB 的结果""" + + BASE = Path("..") + + MUSA_DIR = BASE / "musa_results" + BNB_DIR = BASE / "bnb_results" + BNB_CSV = BASE / "bnb_benchmark_results.csv" + + results = [] + + # 读取 BnB benchmark CSV + if not BNB_CSV.exists(): + print(f" 找不到 {BNB_CSV}") + return + + bnb_data = {} + with open(BNB_CSV, 'r') as f: + reader = csv.DictReader(f) + for row in reader: + key = f"{row['Shape']}_bs{row['Blocksize']}" + bnb_data[key] = { + 'time_ms': float(row['BnB Time (ms)']), + 'bnb_file': row['BnB Output File'] + } + + # ================================================== + # 查找所有 MUSA 结果 + # ================================================== + musa_files = list(MUSA_DIR.glob("dequant_*.fp16")) + + print("\n" + "="*80) + print("对比结果汇总") + print("="*80) + print(f"{'Shape':<12} {'Block':<6} {'BnB (ms)':<12} {'MUSA (ms)':<12} " + f"{'Speedup':<8} {'MAE':<12} {'Max Diff':<12}") + print("-"*80) + + for musa_file in musa_files: + filename = musa_file.name + + parts = filename.replace('dequant_', '').replace('.fp16', '').split('_bs') + shape = parts[0] + blocksize = parts[1] + + # 读取 MUSA 输出 + musa_data = np.fromfile(musa_file, dtype=np.float16) + musa_tensor = torch.from_numpy(musa_data) + + # 读取 BnB 输出 + bnb_file = BNB_DIR / f"bnb_{shape}_bs{blocksize}.fp16" + if not bnb_file.exists(): + print(f" 找不到 BnB 文件: {bnb_file}") + continue + + bnb_data_np = np.fromfile(bnb_file, dtype=np.float16) + bnb_tensor = torch.from_numpy(bnb_data_np) + + + + # 误差计算 + mae = torch.mean(torch.abs(bnb_tensor - musa_tensor)).item() + mse = torch.mean((bnb_tensor - musa_tensor) ** 2).item() + max_diff = torch.max(torch.abs(bnb_tensor - musa_tensor)).item() + + # 时间读取 + key = f"{shape}_bs{blocksize}" + bnb_time = bnb_data[key]['time_ms'] + + log_file = MUSA_DIR / f"perf_{shape}_bs{blocksize}.log" + musa_time = 0 + + if log_file.exists(): + with open(log_file, 'r') as f: + for line in f: + if 'kernel_time_ms' in line: + musa_time = float(line.strip().split('=')[1]) + + speedup = bnb_time / musa_time if musa_time > 0 else 0 + + print(f"{shape:<12} {blocksize:<6} " + f"{bnb_time:<12.4f} {musa_time:<12.4f} " + f"{speedup:<8.2f} {mae:<12.8f} {max_diff:<12.8f}") + + results.append({ + 'shape': shape, + 'blocksize': int(blocksize), + 'bnb_time_ms': bnb_time, + 'musa_time_ms': musa_time, + 'speedup': speedup, + 'mae': mae, + 'mse': mse, + 'max_diff': max_diff, + }) + + print("="*80) + + + # 保存 CSV + df = pd.DataFrame(results) + out_csv = "comparison_musa_results.csv" + float_format = '%.8f' + df.to_csv(out_csv, index=False, float_format=float_format) + + print(f"\n 对比结果已保存到: {out_csv}") + md_file = "comparison_musa_results.md" + with open(md_file, 'w', encoding='utf-8') as f: + # 写入表格标题 + f.write("# MUSA vs BnB 对比结果\n\n") + + # 写入表头 + f.write("| Shape | Block | BnB (ms) | MUSA (ms) | Speedup | MAE | Max Diff |\n") + f.write("|-------|-------|----------|-----------|---------|-----|----------|\n") + + # 写入数据 + for _, row in df.iterrows(): + line = (f"| {row['shape']} | {row['blocksize']} | " + f"{row['bnb_time_ms']:.4f} | {row['musa_time_ms']:.4f} | " + f"{row['speedup']:.2f} | {row['mae']:.8f} | {row['max_diff']:.8f} |") + f.write(line + '\n') + + # 添加详细数据表 + f.write("\n\n## 详细数据\n\n") + f.write("| Shape | Block | BnB (ms) | MUSA (ms) | Speedup | MAE | MSE | Max Diff |\n") + f.write("|-------|-------|----------|-----------|---------|-----|-----|----------|\n") + + for _, row in df.iterrows(): + line = (f"| {row['shape']} | {row['blocksize']} | " + f"{row['bnb_time_ms']:.4f} | {row['musa_time_ms']:.4f} | " + f"{row['speedup']:.2f} | {row['mae']:.8f} | {row['mse']:.8f} | " + f"{row['max_diff']:.8f} | ") + f.write(line + '\n') + + print(f" Markdown表格已保存到: {md_file}") + + return df + + + +if __name__ == "__main__": + compare_all() \ No newline at end of file diff --git a/03_nf4_dequant/ayepei/mexc/comparison_musa_results.csv b/03_nf4_dequant/ayepei/mexc/comparison_musa_results.csv new file mode 100644 index 0000000..c05f623 --- /dev/null +++ b/03_nf4_dequant/ayepei/mexc/comparison_musa_results.csv @@ -0,0 +1,21 @@ +shape,blocksize,bnb_time_ms,musa_time_ms,speedup,mae,mse,max_diff +2048x2048,128,0.09250000,0.04350000,2.12643678,0.00029397,0.00000030,0.00390625 +512x512,64,0.10100000,0.00800000,12.62500000,0.00018466,0.00000018,0.00390625 +256x256,64,0.11440000,0.00740000,15.45945946,0.00022900,0.00000024,0.00390625 +3421x3146,64,0.09250000,0.11680000,0.79195205,0.00017142,0.00000012,0.00390625 +7000x7000,64,0.09090000,0.55810000,0.16287404,0.00024140,0.00000024,0.00390625 +7000x7000,128,0.09390000,0.53600000,0.17518657,0.00025988,0.00000024,0.00390625 +2048x2048,64,0.09100000,0.04560000,1.99561404,0.00022352,0.00000018,0.00390625 +6578x1236,128,0.08760000,0.08320000,1.05288462,0.00026894,0.00000024,0.00390625 +1024x1024,128,0.09120000,0.01370000,6.65693431,0.00024605,0.00000024,0.00390625 +256x256,128,0.10640000,0.00770000,13.81818182,0.00024271,0.00000024,0.00390625 +8192x8192,64,0.09180000,0.77260000,0.11881957,0.00017440,0.00000012,0.00390625 +4096x4096,64,0.08910000,0.18320000,0.48635371,0.00018179,0.00000018,0.00390625 +8192x8192,128,0.09120000,0.74250000,0.12282828,0.00022542,0.00000018,0.00390625 +1024x1024,64,0.09600000,0.01450000,6.62068966,0.00028801,0.00000030,0.00390625 +6578x1236,64,0.08950000,0.08670000,1.03229527,0.00022066,0.00000018,0.00390625 +16384x16384,64,0.30740000,3.30830000,0.09291781,0.00019693,0.00000018,0.00390625 +512x512,128,0.09770000,0.00780000,12.52564103,0.00029063,0.00000030,0.00390625 +4096x4096,128,0.09050000,0.17560000,0.51537585,0.00020349,0.00000018,0.00390625 +3421x3146,128,0.09040000,0.11030000,0.81958296,0.00026965,0.00000024,0.00390625 +16384x16384,128,0.29730000,3.18830000,0.09324719,0.00022352,0.00000018,0.00390625 diff --git a/03_nf4_dequant/ayepei/mexc/comparison_musa_results.md b/03_nf4_dequant/ayepei/mexc/comparison_musa_results.md new file mode 100644 index 0000000..7eef9bd --- /dev/null +++ b/03_nf4_dequant/ayepei/mexc/comparison_musa_results.md @@ -0,0 +1,50 @@ +# MUSA vs BnB 对比结果 + +| Shape | Block | BnB (ms) | MUSA (ms) | Speedup | MAE | Max Diff | +|-------|-------|----------|-----------|---------|-----|----------| +| 2048x2048 | 128 | 0.0925 | 0.0435 | 2.13 | 0.00029397 | 0.00390625 | +| 512x512 | 64 | 0.1010 | 0.0080 | 12.62 | 0.00018466 | 0.00390625 | +| 256x256 | 64 | 0.1144 | 0.0074 | 15.46 | 0.00022900 | 0.00390625 | +| 3421x3146 | 64 | 0.0925 | 0.1168 | 0.79 | 0.00017142 | 0.00390625 | +| 7000x7000 | 64 | 0.0909 | 0.5581 | 0.16 | 0.00024140 | 0.00390625 | +| 7000x7000 | 128 | 0.0939 | 0.5360 | 0.18 | 0.00025988 | 0.00390625 | +| 2048x2048 | 64 | 0.0910 | 0.0456 | 2.00 | 0.00022352 | 0.00390625 | +| 6578x1236 | 128 | 0.0876 | 0.0832 | 1.05 | 0.00026894 | 0.00390625 | +| 1024x1024 | 128 | 0.0912 | 0.0137 | 6.66 | 0.00024605 | 0.00390625 | +| 256x256 | 128 | 0.1064 | 0.0077 | 13.82 | 0.00024271 | 0.00390625 | +| 8192x8192 | 64 | 0.0918 | 0.7726 | 0.12 | 0.00017440 | 0.00390625 | +| 4096x4096 | 64 | 0.0891 | 0.1832 | 0.49 | 0.00018179 | 0.00390625 | +| 8192x8192 | 128 | 0.0912 | 0.7425 | 0.12 | 0.00022542 | 0.00390625 | +| 1024x1024 | 64 | 0.0960 | 0.0145 | 6.62 | 0.00028801 | 0.00390625 | +| 6578x1236 | 64 | 0.0895 | 0.0867 | 1.03 | 0.00022066 | 0.00390625 | +| 16384x16384 | 64 | 0.3074 | 3.3083 | 0.09 | 0.00019693 | 0.00390625 | +| 512x512 | 128 | 0.0977 | 0.0078 | 12.53 | 0.00029063 | 0.00390625 | +| 4096x4096 | 128 | 0.0905 | 0.1756 | 0.52 | 0.00020349 | 0.00390625 | +| 3421x3146 | 128 | 0.0904 | 0.1103 | 0.82 | 0.00026965 | 0.00390625 | +| 16384x16384 | 128 | 0.2973 | 3.1883 | 0.09 | 0.00022352 | 0.00390625 | + + +## 详细数据 + +| Shape | Block | BnB (ms) | MUSA (ms) | Speedup | MAE | MSE | Max Diff | +|-------|-------|----------|-----------|---------|-----|-----|----------| +| 2048x2048 | 128 | 0.0925 | 0.0435 | 2.13 | 0.00029397 | 0.00000030 | 0.00390625 | +| 512x512 | 64 | 0.1010 | 0.0080 | 12.62 | 0.00018466 | 0.00000018 | 0.00390625 | +| 256x256 | 64 | 0.1144 | 0.0074 | 15.46 | 0.00022900 | 0.00000024 | 0.00390625 | +| 3421x3146 | 64 | 0.0925 | 0.1168 | 0.79 | 0.00017142 | 0.00000012 | 0.00390625 | +| 7000x7000 | 64 | 0.0909 | 0.5581 | 0.16 | 0.00024140 | 0.00000024 | 0.00390625 | +| 7000x7000 | 128 | 0.0939 | 0.5360 | 0.18 | 0.00025988 | 0.00000024 | 0.00390625 | +| 2048x2048 | 64 | 0.0910 | 0.0456 | 2.00 | 0.00022352 | 0.00000018 | 0.00390625 | +| 6578x1236 | 128 | 0.0876 | 0.0832 | 1.05 | 0.00026894 | 0.00000024 | 0.00390625 | +| 1024x1024 | 128 | 0.0912 | 0.0137 | 6.66 | 0.00024605 | 0.00000024 | 0.00390625 | +| 256x256 | 128 | 0.1064 | 0.0077 | 13.82 | 0.00024271 | 0.00000024 | 0.00390625 | +| 8192x8192 | 64 | 0.0918 | 0.7726 | 0.12 | 0.00017440 | 0.00000012 | 0.00390625 | +| 4096x4096 | 64 | 0.0891 | 0.1832 | 0.49 | 0.00018179 | 0.00000018 | 0.00390625 | +| 8192x8192 | 128 | 0.0912 | 0.7425 | 0.12 | 0.00022542 | 0.00000018 | 0.00390625 | +| 1024x1024 | 64 | 0.0960 | 0.0145 | 6.62 | 0.00028801 | 0.00000030 | 0.00390625 | +| 6578x1236 | 64 | 0.0895 | 0.0867 | 1.03 | 0.00022066 | 0.00000018 | 0.00390625 | +| 16384x16384 | 64 | 0.3074 | 3.3083 | 0.09 | 0.00019693 | 0.00000018 | 0.00390625 | +| 512x512 | 128 | 0.0977 | 0.0078 | 12.53 | 0.00029063 | 0.00000030 | 0.00390625 | +| 4096x4096 | 128 | 0.0905 | 0.1756 | 0.52 | 0.00020349 | 0.00000018 | 0.00390625 | +| 3421x3146 | 128 | 0.0904 | 0.1103 | 0.82 | 0.00026965 | 0.00000024 | 0.00390625 | +| 16384x16384 | 128 | 0.2973 | 3.1883 | 0.09 | 0.00022352 | 0.00000018 | 0.00390625 | diff --git a/03_nf4_dequant/ayepei/mexc/nf4_dequant_musa.mu b/03_nf4_dequant/ayepei/mexc/nf4_dequant_musa.mu new file mode 100644 index 0000000..684d960 --- /dev/null +++ b/03_nf4_dequant/ayepei/mexc/nf4_dequant_musa.mu @@ -0,0 +1,372 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MUSA_CHECK(call) \ + do { \ + musaError_t err = call; \ + if (err != musaSuccess) { \ + fprintf(stderr, "MUSA error at %s:%d - %s\n", __FILE__, __LINE__, \ + musaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// 将表放入到常量内存加快速度 +__constant__ __half NF4_LUT_HALF[16]; +__constant__ __half CODE2_LUT[256]; + +void ensure_directory_exists(const char *path) { + struct stat st = {0}; + if (stat(path, &st) == -1) { +#ifdef _WIN32 + _mkdir(path); +#else + mkdir(path, 0755); +#endif + } +} + +// 初始化 LUT 将 float 转换为 half +void init_nf4_lut() { + float lut_f[16] = {-1.00000000f, -0.69619280f, -0.52507305f, -0.39491710f, + -0.28444138f, -0.18477343f, -0.09105003f, 0.00000000f, + 0.07958030f, 0.16093020f, 0.24611230f, 0.33791524f, + 0.44070983f, 0.56261700f, 0.72295684f, 1.00000000f}; + + __half lut_h[16]; + for (int i = 0; i < 16; i++) { + lut_h[i] = __float2half(lut_f[i]); + } + MUSA_CHECK(musaMemcpyToSymbol(NF4_LUT_HALF, lut_h, sizeof(lut_h))); +} + + +__global__ void nf4_dequant_v7(const uint8_t *__restrict__ packed, + const uint8_t *__restrict__ absmax_q, + const __half *__restrict__ absmax2, float offset, + int64_t total_elements, int blocksize, + int group_size, __half *__restrict__ output) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int64_t total_bytes = total_elements >> 1; // 2 half / byte + int64_t byte_idx = (int64_t)tid * 4; + __shared__ __half s_nf4[16]; + + if(threadIdx.x<16){ + s_nf4[threadIdx.x]=NF4_LUT_HALF[threadIdx.x]; + } + if (byte_idx >= total_bytes) + return; + uint32_t pack4 = ((const uint32_t *)packed)[tid]; + uint8_t b0 = pack4 & 0xFF; + uint8_t b1 = (pack4 >> 8) & 0xFF; + uint8_t b2 = (pack4 >> 16) & 0xFF; + uint8_t b3 = (pack4 >> 24) & 0xFF; + int64_t half_base = byte_idx << 1; + int block_idx = half_base / blocksize; + int group_idx = block_idx / group_size; + __half scale = + __hadd(__hmul(CODE2_LUT[absmax_q[block_idx]], absmax2[group_idx]), + __float2half(offset)); + __half h[8]; + h[0] = __hmul(s_nf4[(b0) >> 4], scale); + h[1] = __hmul(s_nf4[(b0) & 0xF], scale); + h[2] = __hmul(s_nf4[(b1) >> 4], scale); + h[3] = __hmul(s_nf4[(b1) & 0xF], scale); + h[4] = __hmul(s_nf4[(b2) >> 4], scale); + h[5] = __hmul(s_nf4[(b2) & 0xF], scale); + h[6] = __hmul(s_nf4[(b3) >> 4], scale); + h[7] = __hmul(s_nf4[(b3) & 0xF], scale); + + uint4 out_pack; + reinterpret_cast<__half *>(&out_pack)[0] = h[0]; + reinterpret_cast<__half *>(&out_pack)[1] = h[1]; + reinterpret_cast<__half *>(&out_pack)[2] = h[2]; + reinterpret_cast<__half *>(&out_pack)[3] = h[3]; + reinterpret_cast<__half *>(&out_pack)[4] = h[4]; + reinterpret_cast<__half *>(&out_pack)[5] = h[5]; + reinterpret_cast<__half *>(&out_pack)[6] = h[6]; + reinterpret_cast<__half *>(&out_pack)[7] = h[7]; + + ((uint4 *)(output + half_base))[0] = out_pack; +} +// 读取权重文件 +int read_weight_file(const char *filename, int64_t *rows, int64_t *cols, + int *blocksize, uint8_t **packed, uint8_t **absmax_q, + __half **absmax2, __half **code2, float *offset) { + + FILE *fp = fopen(filename, "rb"); + if (!fp) { + fprintf(stderr, "无法打开文件: %s\n", filename); + return -1; + } + + // 读取 header + fread(rows, sizeof(int64_t), 1, fp); + fread(cols, sizeof(int64_t), 1, fp); + fread(blocksize, sizeof(int32_t), 1, fp); + + int64_t total_elements = (*rows) * (*cols); + int64_t num_packed = (total_elements + 1) / 2; + int64_t num_blocks = (total_elements + *blocksize - 1) / *blocksize; + int64_t num_groups = (num_blocks + 255) / 256; + + printf("\n 文件信息:\n"); + printf(" 矩阵: %ld x %ld\n", *rows, *cols); + printf(" 总元素数: %ld\n", total_elements); + printf(" blocksize: %d\n", *blocksize); + printf(" 打包数据大小: %ld bytes\n", num_packed); + printf(" 量化块数: %ld\n", num_blocks); + printf(" 分组数: %ld\n", num_groups); + + // 分配内存 + *packed = (uint8_t *)malloc(num_packed); + *absmax_q = (uint8_t *)malloc(num_blocks); + *absmax2 = (__half *)malloc(num_groups * sizeof(__half)); + *code2 = (__half *)malloc(256 * sizeof(__half)); + + if (!*packed || !*absmax_q || !*absmax2 || !*code2) { + fprintf(stderr, " 主机内存分配失败\n"); + fclose(fp); + return -1; + } + + // 读取数据 + fread(*packed, 1, num_packed, fp); + fread(*absmax_q, 1, num_blocks, fp); + fread(*absmax2, sizeof(__half), num_groups, fp); + fread(*code2, sizeof(__half), 256, fp); + fread(offset, sizeof(float), 1, fp); + + fclose(fp); + printf(" 文件读取成功\n"); + return 0; +} + +// 保存解量化后的权重(自动保存到musa_results目录) +void save_dequantized_weight(const char *filename, __half *weight, + int64_t total_elements) { + ensure_directory_exists("../musa_results"); + + // 构建完整路径 + char full_path[512]; + snprintf(full_path, sizeof(full_path), "../musa_results/%s", filename); + + FILE *fp = fopen(full_path, "wb"); + if (!fp) { + fprintf(stderr, " 无法创建输出文件: %s\n", full_path); + return; + } + + fwrite(weight, sizeof(__half), total_elements, fp); + fclose(fp); + + printf(" 已保存解量化结果: %s (%.2f MB)\n", full_path, + (total_elements * sizeof(__half)) / (1024.0 * 1024.0)); +} + +// 计时器 (毫秒) +double get_time_ms() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0; +} + + +// 计算有效内存带宽 +double calculate_bandwidth(int64_t total_elements, double time_ms) { + // 输入数据大小 + int64_t input_bytes = (total_elements + 1) / 2; // packed + input_bytes += (total_elements + 64 - 1) / 64; // absmax_q (近似) + input_bytes += + ((total_elements + 64 - 1) / 64 + 255) / 256 * sizeof(__half); // absmax2 + input_bytes += 256 * sizeof(__half); // code2 + + // 输出数据大小 + int64_t output_bytes = total_elements * sizeof(__half); + + int64_t total_bytes = input_bytes + output_bytes; + + return (total_bytes / (1024.0 * 1024.0 * 1024.0)) / (time_ms / 1000.0); +} + + +int main(int argc, char **argv) { + + if (argc != 2) { + printf("\n使用方法: %s <权重文件.bin>\n", argv[0]); + printf(" 权重文件格式: 由 Python 脚本生成的 .bin 文件\n"); + printf(" 示例: %s weight_data/weight_1024x1024_bs64.bin\n\n", argv[0]); + return -1; + } + + const char *input_file = argv[1]; + + // 确保输出目录存在 + ensure_directory_exists("../musa_results"); + + // 初始化 LUT + init_nf4_lut(); + + // 读取权重文件 + printf("\n 读取权重文件: %s\n", input_file); + int64_t rows, cols; + int blocksize; + uint8_t *h_packed, *h_absmax_q; + __half *h_absmax2, *h_code2; + float offset; + + if (read_weight_file(input_file, &rows, &cols, &blocksize, &h_packed, + &h_absmax_q, &h_absmax2, &h_code2, &offset) != 0) { + return -1; + } + + int64_t total_elements = rows * cols; + int64_t num_units = (total_elements + 1) / 2; // 每个 uint8 包含两个 half + + // 计算 GPU 内存大小 + int64_t num_blocks = (total_elements + blocksize - 1) / blocksize; + int64_t num_groups = (num_blocks + 255) / 256; + + printf("\n 计算参数:\n"); + printf(" total_elements: %ld\n", total_elements); + printf(" num_units: %ld\n", num_units); + printf(" num_blocks: %ld\n", num_blocks); + printf(" num_groups: %ld\n", num_groups); + + // 分配 GPU 内存 + printf("\n 分配 MUSA 内存...\n"); + uint8_t *d_packed, *d_absmax_q; + __half *d_absmax2, *d_code2, *d_output; + + MUSA_CHECK(musaMalloc(&d_packed, num_units)); + MUSA_CHECK(musaMalloc(&d_absmax_q, num_blocks)); + MUSA_CHECK(musaMalloc(&d_absmax2, num_groups * sizeof(__half))); + MUSA_CHECK(musaMalloc(&d_code2, 256 * sizeof(__half))); + MUSA_CHECK(musaMalloc(&d_output, total_elements * sizeof(__half))); + + // 拷贝数据到 GPU + printf(" 拷贝数据到 MUSA 设备...\n"); + MUSA_CHECK(musaMemcpy(d_packed, h_packed, num_units, musaMemcpyHostToDevice)); + MUSA_CHECK(musaMemcpy(d_absmax_q, h_absmax_q, num_blocks, musaMemcpyHostToDevice)); + MUSA_CHECK(musaMemcpy(d_absmax2, h_absmax2, num_groups * sizeof(__half), + musaMemcpyHostToDevice)); + MUSA_CHECK(musaMemcpy(d_code2, h_code2, 256 * sizeof(__half), + musaMemcpyHostToDevice)); + MUSA_CHECK(musaMemcpyToSymbol(CODE2_LUT, h_code2, 256 * sizeof(__half))); + + // 分配主机输出内存 + __half *h_output = (__half *)malloc(total_elements * sizeof(__half)); + if (!h_output) { + fprintf(stderr, " 主机输出内存分配失败\n"); + return -1; + } + + // 配置内核启动参数 + int threads = 256; + int64_t total_bytes = total_elements >> 1; + int blocks = (total_bytes/4 + threads - 1) / threads; + + printf("\n 内核配置:\n"); + printf(" blocks: %d\n", blocks); + printf(" threads per block: %d\n", threads); + printf(" 总线程数: %d\n", blocks * threads); + + // 预热 (5次) + printf("\n 预热 (5次)...\n"); + for (int i = 0; i < 5; i++) { + nf4_dequant_v7<<>>(d_packed, d_absmax_q, d_absmax2, offset, total_elements, blocksize, + 256, d_output); + } + MUSA_CHECK(musaDeviceSynchronize()); + + // 正式测试 (100次) + printf(" 性能测试 (100次迭代)...\n"); + double start_time = get_time_ms(); + + for (int i = 0; i < 100; i++) { + nf4_dequant_v7<<>>(d_packed, d_absmax_q, d_absmax2, offset, total_elements, blocksize, + 256, d_output); + } + + MUSA_CHECK(musaDeviceSynchronize()); + double end_time = get_time_ms(); + + double total_time = end_time - start_time; + double avg_time_ms = total_time / 100.0; + + // 计算带宽 + double bandwidth = calculate_bandwidth(total_elements, avg_time_ms); + + // 拷贝结果回主机 + printf(" 拷贝结果回主机...\n"); + MUSA_CHECK(musaMemcpy(h_output, d_output, total_elements * sizeof(__half), + musaMemcpyDeviceToHost)); + + // 生成输出文件名 + char output_file[256]; + snprintf(output_file, sizeof(output_file), "dequant_%ldx%ld_bs%d.fp16", rows, + cols, blocksize); + + // 保存解量化结果(自动保存到musa_results目录) + printf("\n 保存解量化结果...\n"); + save_dequantized_weight(output_file, h_output, total_elements); + + // 生成性能日志文件名 + char log_file[256]; + snprintf(log_file, sizeof(log_file), "perf_%ldx%ld_bs%d.log", rows, cols, + blocksize); + + // 输出性能结果 + printf("输入文件: %s\n", input_file); + printf("矩阵大小: %ld x %ld\n", rows, cols); + printf("总元素数: %ld\n", total_elements); + printf("数据大小: %.2f MB\n", + total_elements * sizeof(__half) / (1024.0 * 1024.0)); + printf("\n"); + printf("核函数执行时间: %.4f ms\n", avg_time_ms); + printf("有效内存带宽: %.2f GB/s\n", bandwidth); + printf("\n"); + printf("输出文件: musa_results/%s\n", output_file); + printf("日志文件: musa_results/%s\n", log_file); + + // 保存性能日志(也保存到musa_results目录) + char log_path[512]; + snprintf(log_path, sizeof(log_path), "../musa_results/%s", log_file); + + FILE *log_fp = fopen(log_path, "w"); + if (log_fp) { + fprintf(log_fp, "input_file=%s\n", input_file); + fprintf(log_fp, "rows=%ld\n", rows); + fprintf(log_fp, "cols=%ld\n", cols); + fprintf(log_fp, "blocksize=%d\n", blocksize); + fprintf(log_fp, "total_elements=%ld\n", total_elements); + fprintf(log_fp, "kernel_time_ms=%.4f\n", avg_time_ms); + fprintf(log_fp, "bandwidth_gbps=%.2f\n", bandwidth); + fprintf(log_fp, "output_file=musa_results/%s\n", output_file); + fclose(log_fp); + printf(" 性能日志已保存到: %s\n", log_path); + } + + free(h_packed); + free(h_absmax_q); + free(h_absmax2); + free(h_code2); + free(h_output); + + musaFree(d_packed); + musaFree(d_absmax_q); + musaFree(d_absmax2); + musaFree(d_code2); + musaFree(d_output); + + printf("\n 测试完成!\n\n"); + return 0; +} + diff --git a/03_nf4_dequant/ayepei/mexc/nf4_musa b/03_nf4_dequant/ayepei/mexc/nf4_musa new file mode 100644 index 0000000..8224952 Binary files /dev/null and b/03_nf4_dequant/ayepei/mexc/nf4_musa differ diff --git a/03_nf4_dequant/ayepei/mexc/run.md b/03_nf4_dequant/ayepei/mexc/run.md new file mode 100644 index 0000000..80df44e --- /dev/null +++ b/03_nf4_dequant/ayepei/mexc/run.md @@ -0,0 +1,6 @@ +$MUSA_ROOT/bin/mcc -O3 -std=c++17 \ +-I$MUSA_ROOT/include \ +-L$MUSA_ROOT/lib \ +-L/usr/lib/gcc/x86_64-linux-gnu/11 \ +-lmusart -lstdc++ \ +nf4_dequant_musa.mu -o nf4_musa \ No newline at end of file diff --git a/03_nf4_dequant/ayepei/mexc/test.sbatch b/03_nf4_dequant/ayepei/mexc/test.sbatch new file mode 100644 index 0000000..f60a0c0 --- /dev/null +++ b/03_nf4_dequant/ayepei/mexc/test.sbatch @@ -0,0 +1,19 @@ +#!/bin/bash +#SBATCH --job-name=test_job # 任务名 +#SBATCH --output=output_%j.log # 标准输出文件(%j 会替换成 job ID) +#SBATCH --error=error_%j.log # 标准错误输出文件 +#SBATCH --partition=mt # 分区名(机器系统默认分区是 mt) +#SBATCH --nodes=1 # 需要的节点数 +#SBATCH --ntasks=1 # 总任务数(通常 = 节点数 × 每节点任务数) +#SBATCH --cpus-per-task=16 # 每个任务需要的 CPU 核心数 +#SBATCH --gres=gpu:mt:1 # 请求 4 块 GPU(mt 是 Gres 类型) +#SBATCH --mem=256G # 请求的内存 +#SBATCH --time=00:20:00 # 运行时间上限 + + + + + +for f in ../weight_data/*.bin; do + srun ./nf4_musa "$f" +done \ No newline at end of file diff --git a/03_nf4_dequant/ayepei/musa_results/perf_1024x1024_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_1024x1024_bs128.log new file mode 100644 index 0000000..e8fc29b --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_1024x1024_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_1024x1024_bs128.bin +rows=1024 +cols=1024 +blocksize=128 +total_elements=1048576 +kernel_time_ms=0.0139 +bandwidth_gbps=176.89 +output_file=musa_results/dequant_1024x1024_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/musa_results/perf_1024x1024_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_1024x1024_bs64.log new file mode 100644 index 0000000..ceed4e9 --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_1024x1024_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_1024x1024_bs64.bin +rows=1024 +cols=1024 +blocksize=64 +total_elements=1048576 +kernel_time_ms=0.0148 +bandwidth_gbps=166.28 +output_file=musa_results/dequant_1024x1024_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/musa_results/perf_16384x16384_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_16384x16384_bs128.log new file mode 100644 index 0000000..9aaaf21 --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_16384x16384_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_16384x16384_bs128.bin +rows=16384 +cols=16384 +blocksize=128 +total_elements=268435456 +kernel_time_ms=3.1883 +bandwidth_gbps=197.26 +output_file=musa_results/dequant_16384x16384_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/musa_results/perf_16384x16384_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_16384x16384_bs64.log new file mode 100644 index 0000000..b64822d --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_16384x16384_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_16384x16384_bs64.bin +rows=16384 +cols=16384 +blocksize=64 +total_elements=268435456 +kernel_time_ms=3.3084 +bandwidth_gbps=190.10 +output_file=musa_results/dequant_16384x16384_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/musa_results/perf_2048x2048_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_2048x2048_bs128.log new file mode 100644 index 0000000..75659f4 --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_2048x2048_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_2048x2048_bs128.bin +rows=2048 +cols=2048 +blocksize=128 +total_elements=4194304 +kernel_time_ms=0.0431 +bandwidth_gbps=227.91 +output_file=musa_results/dequant_2048x2048_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/musa_results/perf_2048x2048_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_2048x2048_bs64.log new file mode 100644 index 0000000..a861daf --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_2048x2048_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_2048x2048_bs64.bin +rows=2048 +cols=2048 +blocksize=64 +total_elements=4194304 +kernel_time_ms=0.0453 +bandwidth_gbps=216.90 +output_file=musa_results/dequant_2048x2048_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/musa_results/perf_256x256_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_256x256_bs128.log new file mode 100644 index 0000000..cec8720 --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_256x256_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_256x256_bs128.bin +rows=256 +cols=256 +blocksize=128 +total_elements=65536 +kernel_time_ms=0.0070 +bandwidth_gbps=22.07 +output_file=musa_results/dequant_256x256_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/musa_results/perf_256x256_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_256x256_bs64.log new file mode 100644 index 0000000..e3d9d34 --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_256x256_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_256x256_bs64.bin +rows=256 +cols=256 +blocksize=64 +total_elements=65536 +kernel_time_ms=0.0075 +bandwidth_gbps=20.62 +output_file=musa_results/dequant_256x256_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/musa_results/perf_3421x3146_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_3421x3146_bs128.log new file mode 100644 index 0000000..083f631 --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_3421x3146_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_3421x3146_bs128.bin +rows=3421 +cols=3146 +blocksize=128 +total_elements=10762466 +kernel_time_ms=0.1103 +bandwidth_gbps=228.70 +output_file=musa_results/dequant_3421x3146_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/musa_results/perf_3421x3146_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_3421x3146_bs64.log new file mode 100644 index 0000000..9202688 --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_3421x3146_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_3421x3146_bs64.bin +rows=3421 +cols=3146 +blocksize=64 +total_elements=10762466 +kernel_time_ms=0.1152 +bandwidth_gbps=218.97 +output_file=musa_results/dequant_3421x3146_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/musa_results/perf_4096x4096_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_4096x4096_bs128.log new file mode 100644 index 0000000..8b10a1b --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_4096x4096_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_4096x4096_bs128.bin +rows=4096 +cols=4096 +blocksize=128 +total_elements=16777216 +kernel_time_ms=0.1756 +bandwidth_gbps=223.87 +output_file=musa_results/dequant_4096x4096_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/musa_results/perf_4096x4096_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_4096x4096_bs64.log new file mode 100644 index 0000000..d67c5f7 --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_4096x4096_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_4096x4096_bs64.bin +rows=4096 +cols=4096 +blocksize=64 +total_elements=16777216 +kernel_time_ms=0.1832 +bandwidth_gbps=214.56 +output_file=musa_results/dequant_4096x4096_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/musa_results/perf_512x512_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_512x512_bs128.log new file mode 100644 index 0000000..21ab078 --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_512x512_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_512x512_bs128.bin +rows=512 +cols=512 +blocksize=128 +total_elements=262144 +kernel_time_ms=0.0079 +bandwidth_gbps=77.61 +output_file=musa_results/dequant_512x512_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/musa_results/perf_512x512_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_512x512_bs64.log new file mode 100644 index 0000000..ea20645 --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_512x512_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_512x512_bs64.bin +rows=512 +cols=512 +blocksize=64 +total_elements=262144 +kernel_time_ms=0.0075 +bandwidth_gbps=82.06 +output_file=musa_results/dequant_512x512_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/musa_results/perf_6578x1236_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_6578x1236_bs128.log new file mode 100644 index 0000000..2b2ead1 --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_6578x1236_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_6578x1236_bs128.bin +rows=6578 +cols=1236 +blocksize=128 +total_elements=8130408 +kernel_time_ms=0.0831 +bandwidth_gbps=229.21 +output_file=musa_results/dequant_6578x1236_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/musa_results/perf_6578x1236_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_6578x1236_bs64.log new file mode 100644 index 0000000..26988f1 --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_6578x1236_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_6578x1236_bs64.bin +rows=6578 +cols=1236 +blocksize=64 +total_elements=8130408 +kernel_time_ms=0.0867 +bandwidth_gbps=219.75 +output_file=musa_results/dequant_6578x1236_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/musa_results/perf_7000x7000_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_7000x7000_bs128.log new file mode 100644 index 0000000..e99804c --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_7000x7000_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_7000x7000_bs128.bin +rows=7000 +cols=7000 +blocksize=128 +total_elements=49000000 +kernel_time_ms=0.5361 +bandwidth_gbps=214.14 +output_file=musa_results/dequant_7000x7000_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/musa_results/perf_7000x7000_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_7000x7000_bs64.log new file mode 100644 index 0000000..c4b39d5 --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_7000x7000_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_7000x7000_bs64.bin +rows=7000 +cols=7000 +blocksize=64 +total_elements=49000000 +kernel_time_ms=0.5581 +bandwidth_gbps=205.69 +output_file=musa_results/dequant_7000x7000_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/musa_results/perf_8192x8192_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_8192x8192_bs128.log new file mode 100644 index 0000000..2dbf46c --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_8192x8192_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_8192x8192_bs128.bin +rows=8192 +cols=8192 +blocksize=128 +total_elements=67108864 +kernel_time_ms=0.7425 +bandwidth_gbps=211.76 +output_file=musa_results/dequant_8192x8192_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/musa_results/perf_8192x8192_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_8192x8192_bs64.log new file mode 100644 index 0000000..ef7e058 --- /dev/null +++ b/03_nf4_dequant/ayepei/musa_results/perf_8192x8192_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_8192x8192_bs64.bin +rows=8192 +cols=8192 +blocksize=64 +total_elements=67108864 +kernel_time_ms=0.7726 +bandwidth_gbps=203.52 +output_file=musa_results/dequant_8192x8192_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/mx/compare_results.py b/03_nf4_dequant/ayepei/mx/compare_results.py new file mode 100644 index 0000000..340efb3 --- /dev/null +++ b/03_nf4_dequant/ayepei/mx/compare_results.py @@ -0,0 +1,131 @@ +import csv +from pathlib import Path + +import numpy as np +import pandas as pd +import torch + + +def compare_all(): + """对比所有 mx 和 BnB 的结果""" + + base = Path("..") + mx_dir = base / "mx_results" + bnb_dir = base / "bnb_results" + bnb_csv = base / "bnb_benchmark_results.csv" + + results = [] + + if not bnb_csv.exists(): + print(f"找不到 {bnb_csv}") + return + + bnb_data = {} + with open(bnb_csv, "r", encoding="utf-8") as f: + reader = csv.DictReader(f) + for row in reader: + key = f"{row['Shape']}_bs{row['Blocksize']}" + bnb_data[key] = { + "time_ms": float(row["BnB Time (ms)"]), + "bnb_file": row["BnB Output File"], + } + + mx_files = list(mx_dir.glob("dequant_*.fp16")) + + print("\n" + "=" * 90) + print("对比结果汇总") + print("=" * 90) + print( + f"{'Shape':<12} {'Block':<6} {'BnB (ms)':<12} {'MX (ms)':<12} {'Speedup':<8} {'MAE':<12} {'Max Diff':<12}" + ) + print("-" * 90) + + for mx_file in mx_files: + filename = mx_file.name + shape, blocksize = ( + filename.replace("dequant_", "").replace(".fp16", "").split("_bs") + ) + + mx_tensor = torch.from_numpy(np.fromfile(mx_file, dtype=np.float16)) + + bnb_file = bnb_dir / f"bnb_{shape}_bs{blocksize}.fp16" + if not bnb_file.exists(): + print(f"找不到 BnB 文件: {bnb_file}") + continue + + bnb_tensor = torch.from_numpy(np.fromfile(bnb_file, dtype=np.float16)) + + + + mae = torch.mean(torch.abs(bnb_tensor - mx_tensor)).item() + mse = torch.mean((bnb_tensor - mx_tensor) ** 2).item() + max_diff = torch.max(torch.abs(bnb_tensor - mx_tensor)).item() + + key = f"{shape}_bs{blocksize}" + bnb_time = bnb_data.get(key, {}).get("time_ms", 0.0) + + log_file = mx_dir / f"perf_{shape}_bs{blocksize}.log" + mx_time = 0.0 + if log_file.exists(): + with open(log_file, "r", encoding="utf-8") as f: + for line in f: + if "kernel_time_ms" in line: + mx_time = float(line.strip().split("=")[1]) + + speedup = bnb_time / mx_time if mx_time > 0 else 0.0 + + print( + f"{shape:<12} {blocksize:<6} {bnb_time:<12.4f} {mx_time:<12.4f} {speedup:<8.2f} {mae:<12.8f} {max_diff:<12.8f}" + ) + + results.append( + { + "shape": shape, + "blocksize": int(blocksize), + "bnb_time_ms": bnb_time, + "mx_time_ms": mx_time, + "speedup": speedup, + "mae": mae, + "mse": mse, + "max_diff": max_diff, + } + ) + + print("=" * 90) + + if not results: + print("没有可用的对比结果。") + return + + df = pd.DataFrame(results) + out_csv = "comparison_mx_results.csv" + df.to_csv(out_csv, index=False, float_format="%.8f") + print(f"\n对比结果已保存到: {out_csv}") + + md_file = "comparison_mx_results.md" + with open(md_file, "w", encoding="utf-8") as f: + f.write("# MX vs BnB 对比结果\n\n") + f.write("| Shape | Block | BnB (ms) | MX (ms) | Speedup | MAE | Max Diff |\n") + f.write("|-------|-------|----------|---------|---------|-----|----------|\n") + for _, row in df.iterrows(): + f.write( + f"| {row['shape']} | {row['blocksize']} | {row['bnb_time_ms']:.4f} | {row['mx_time_ms']:.4f} | {row['speedup']:.2f} | {row['mae']:.8f} | {row['max_diff']:.8f} |\n" + ) + + f.write("\n\n## 详细数据\n\n") + f.write( + "| Shape | Block | BnB (ms) | MX (ms) | Speedup | MAE | MSE | Max Diff |\n" + ) + f.write( + "|-------|-------|----------|---------|---------|-----|-----|----------|\n" + ) + for _, row in df.iterrows(): + f.write( + f"| {row['shape']} | {row['blocksize']} | {row['bnb_time_ms']:.4f} | {row['mx_time_ms']:.4f} | {row['speedup']:.2f} | {row['mae']:.8f} | {row['mse']:.8f} | {row['max_diff']:.8f} |\n" + ) + + print(f"Markdown 表格已保存到: {md_file}") + + +if __name__ == "__main__": + compare_all() diff --git a/03_nf4_dequant/ayepei/mx/comparison_mx_results.csv b/03_nf4_dequant/ayepei/mx/comparison_mx_results.csv new file mode 100644 index 0000000..74301be --- /dev/null +++ b/03_nf4_dequant/ayepei/mx/comparison_mx_results.csv @@ -0,0 +1,21 @@ +shape,blocksize,bnb_time_ms,mx_time_ms,speedup,mae,mse,max_diff +1024x1024,128,0.09120000,0.02480000,3.67741935,0.00024605,0.00000024,0.00390625 +1024x1024,64,0.09600000,0.01690000,5.68047337,0.00028801,0.00000030,0.00390625 +16384x16384,128,0.29730000,0.92820000,0.32029735,0.00022352,0.00000018,0.00390625 +16384x16384,64,0.30740000,1.01060000,0.30417574,0.00019693,0.00000018,0.00390625 +2048x2048,128,0.09250000,0.03250000,2.84615385,0.00029397,0.00000030,0.00390625 +2048x2048,64,0.09100000,0.02810000,3.23843416,0.00022352,0.00000018,0.00390625 +256x256,128,0.10640000,0.00930000,11.44086022,0.00024271,0.00000024,0.00390625 +256x256,64,0.11440000,0.00940000,12.17021277,0.00022900,0.00000024,0.00390625 +3421x3146,128,0.09040000,0.05090000,1.77603143,0.00026965,0.00000024,0.00390625 +3421x3146,64,0.09250000,0.05170000,1.78916828,0.00017142,0.00000012,0.00390625 +4096x4096,128,0.09050000,0.08130000,1.11316113,0.00020349,0.00000018,0.00390625 +4096x4096,64,0.08910000,0.07350000,1.21224490,0.00018179,0.00000018,0.00390625 +512x512,128,0.09770000,0.01540000,6.34415584,0.00029063,0.00000030,0.00390625 +512x512,64,0.10100000,0.01540000,6.55844156,0.00018466,0.00000018,0.00390625 +6578x1236,128,0.08760000,0.04020000,2.17910448,0.00026894,0.00000024,0.00390625 +6578x1236,64,0.08950000,0.04850000,1.84536082,0.00022066,0.00000018,0.00390625 +7000x7000,128,0.09390000,0.18810000,0.49920255,0.00025988,0.00000024,0.00390625 +7000x7000,64,0.09090000,0.19050000,0.47716535,0.00024140,0.00000024,0.00390625 +8192x8192,128,0.09120000,0.23580000,0.38676845,0.00022542,0.00000018,0.00390625 +8192x8192,64,0.09180000,0.26250000,0.34971429,0.00017440,0.00000012,0.00390625 diff --git a/03_nf4_dequant/ayepei/mx/comparison_mx_results.md b/03_nf4_dequant/ayepei/mx/comparison_mx_results.md new file mode 100644 index 0000000..6e8680a --- /dev/null +++ b/03_nf4_dequant/ayepei/mx/comparison_mx_results.md @@ -0,0 +1,50 @@ +# MX vs BnB 对比结果 + +| Shape | Block | BnB (ms) | MX (ms) | Speedup | MAE | Max Diff | +|-------|-------|----------|---------|---------|-----|----------| +| 1024x1024 | 128 | 0.0912 | 0.0248 | 3.68 | 0.00024605 | 0.00390625 | +| 1024x1024 | 64 | 0.0960 | 0.0169 | 5.68 | 0.00028801 | 0.00390625 | +| 16384x16384 | 128 | 0.2973 | 0.9282 | 0.32 | 0.00022352 | 0.00390625 | +| 16384x16384 | 64 | 0.3074 | 1.0106 | 0.30 | 0.00019693 | 0.00390625 | +| 2048x2048 | 128 | 0.0925 | 0.0325 | 2.85 | 0.00029397 | 0.00390625 | +| 2048x2048 | 64 | 0.0910 | 0.0281 | 3.24 | 0.00022352 | 0.00390625 | +| 256x256 | 128 | 0.1064 | 0.0093 | 11.44 | 0.00024271 | 0.00390625 | +| 256x256 | 64 | 0.1144 | 0.0094 | 12.17 | 0.00022900 | 0.00390625 | +| 3421x3146 | 128 | 0.0904 | 0.0509 | 1.78 | 0.00026965 | 0.00390625 | +| 3421x3146 | 64 | 0.0925 | 0.0517 | 1.79 | 0.00017142 | 0.00390625 | +| 4096x4096 | 128 | 0.0905 | 0.0813 | 1.11 | 0.00020349 | 0.00390625 | +| 4096x4096 | 64 | 0.0891 | 0.0735 | 1.21 | 0.00018179 | 0.00390625 | +| 512x512 | 128 | 0.0977 | 0.0154 | 6.34 | 0.00029063 | 0.00390625 | +| 512x512 | 64 | 0.1010 | 0.0154 | 6.56 | 0.00018466 | 0.00390625 | +| 6578x1236 | 128 | 0.0876 | 0.0402 | 2.18 | 0.00026894 | 0.00390625 | +| 6578x1236 | 64 | 0.0895 | 0.0485 | 1.85 | 0.00022066 | 0.00390625 | +| 7000x7000 | 128 | 0.0939 | 0.1881 | 0.50 | 0.00025988 | 0.00390625 | +| 7000x7000 | 64 | 0.0909 | 0.1905 | 0.48 | 0.00024140 | 0.00390625 | +| 8192x8192 | 128 | 0.0912 | 0.2358 | 0.39 | 0.00022542 | 0.00390625 | +| 8192x8192 | 64 | 0.0918 | 0.2625 | 0.35 | 0.00017440 | 0.00390625 | + + +## 详细数据 + +| Shape | Block | BnB (ms) | MX (ms) | Speedup | MAE | MSE | Max Diff | +|-------|-------|----------|---------|---------|-----|-----|----------| +| 1024x1024 | 128 | 0.0912 | 0.0248 | 3.68 | 0.00024605 | 0.00000024 | 0.00390625 | +| 1024x1024 | 64 | 0.0960 | 0.0169 | 5.68 | 0.00028801 | 0.00000030 | 0.00390625 | +| 16384x16384 | 128 | 0.2973 | 0.9282 | 0.32 | 0.00022352 | 0.00000018 | 0.00390625 | +| 16384x16384 | 64 | 0.3074 | 1.0106 | 0.30 | 0.00019693 | 0.00000018 | 0.00390625 | +| 2048x2048 | 128 | 0.0925 | 0.0325 | 2.85 | 0.00029397 | 0.00000030 | 0.00390625 | +| 2048x2048 | 64 | 0.0910 | 0.0281 | 3.24 | 0.00022352 | 0.00000018 | 0.00390625 | +| 256x256 | 128 | 0.1064 | 0.0093 | 11.44 | 0.00024271 | 0.00000024 | 0.00390625 | +| 256x256 | 64 | 0.1144 | 0.0094 | 12.17 | 0.00022900 | 0.00000024 | 0.00390625 | +| 3421x3146 | 128 | 0.0904 | 0.0509 | 1.78 | 0.00026965 | 0.00000024 | 0.00390625 | +| 3421x3146 | 64 | 0.0925 | 0.0517 | 1.79 | 0.00017142 | 0.00000012 | 0.00390625 | +| 4096x4096 | 128 | 0.0905 | 0.0813 | 1.11 | 0.00020349 | 0.00000018 | 0.00390625 | +| 4096x4096 | 64 | 0.0891 | 0.0735 | 1.21 | 0.00018179 | 0.00000018 | 0.00390625 | +| 512x512 | 128 | 0.0977 | 0.0154 | 6.34 | 0.00029063 | 0.00000030 | 0.00390625 | +| 512x512 | 64 | 0.1010 | 0.0154 | 6.56 | 0.00018466 | 0.00000018 | 0.00390625 | +| 6578x1236 | 128 | 0.0876 | 0.0402 | 2.18 | 0.00026894 | 0.00000024 | 0.00390625 | +| 6578x1236 | 64 | 0.0895 | 0.0485 | 1.85 | 0.00022066 | 0.00000018 | 0.00390625 | +| 7000x7000 | 128 | 0.0939 | 0.1881 | 0.50 | 0.00025988 | 0.00000024 | 0.00390625 | +| 7000x7000 | 64 | 0.0909 | 0.1905 | 0.48 | 0.00024140 | 0.00000024 | 0.00390625 | +| 8192x8192 | 128 | 0.0912 | 0.2358 | 0.39 | 0.00022542 | 0.00000018 | 0.00390625 | +| 8192x8192 | 64 | 0.0918 | 0.2625 | 0.35 | 0.00017440 | 0.00000012 | 0.00390625 | diff --git a/03_nf4_dequant/ayepei/mx/nf4_dequant_mx b/03_nf4_dequant/ayepei/mx/nf4_dequant_mx new file mode 100755 index 0000000..0522008 Binary files /dev/null and b/03_nf4_dequant/ayepei/mx/nf4_dequant_mx differ diff --git a/03_nf4_dequant/ayepei/mx/nf4_dequant_mx.maca b/03_nf4_dequant/ayepei/mx/nf4_dequant_mx.maca new file mode 100644 index 0000000..bdacfa8 --- /dev/null +++ b/03_nf4_dequant/ayepei/mx/nf4_dequant_mx.maca @@ -0,0 +1,366 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define GPU_CHECK(call) \ + do { \ + mcError_t err = call; \ + if (err != mcSuccess) { \ + fprintf(stderr, "GPU error at %s:%d - %s\n", __FILE__, __LINE__, \ + mcGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +__constant__ __half NF4_LUT_HALF[16]; +__constant__ __half CODE2_LUT[256]; + +bool read_exact(FILE *fp, void *dst, size_t elem_size, size_t count) { + return fread(dst, elem_size, count, fp) == count; +} + +void ensure_directory_exists(const char *path) { + struct stat st = {0}; + if (stat(path, &st) == -1) { +#ifdef _WIN32 + mkdir(path); +#else + mkdir(path, 0755); +#endif + } +} + +void init_nf4_lut() { + float lut_f[16] = {-1.00000000f, -0.69619280f, -0.52507305f, -0.39491710f, + -0.28444138f, -0.18477343f, -0.09105003f, 0.00000000f, + 0.07958030f, 0.16093020f, 0.24611230f, 0.33791524f, + 0.44070983f, 0.56261700f, 0.72295684f, 1.00000000f}; + + __half lut_h[16]; + for (int i = 0; i < 16; ++i) { + lut_h[i] = __float2half(lut_f[i]); + } + GPU_CHECK(mcMemcpyToSymbol(NF4_LUT_HALF, lut_h, sizeof(lut_h))); +} + +__global__ void nf4_dequant_v6(const uint8_t *__restrict__ packed, + const uint8_t *__restrict__ absmax_q, + const __half *__restrict__ absmax2, float offset, + int64_t total_elements, int blocksize, + int group_size, __half *__restrict__ output) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int64_t total_bytes = total_elements >> 1; // 2 half / byte + int64_t byte_idx = (int64_t)tid * 4; + if (byte_idx >= total_bytes) + return; + uint32_t pack4 = ((const uint32_t *)packed)[tid]; + uint8_t b0 = pack4 & 0xFF; + uint8_t b1 = (pack4 >> 8) & 0xFF; + uint8_t b2 = (pack4 >> 16) & 0xFF; + uint8_t b3 = (pack4 >> 24) & 0xFF; + int64_t half_base = byte_idx << 1; + int block_idx = half_base / blocksize; + int group_idx = block_idx / group_size; + __half scale = + __hadd(__hmul(CODE2_LUT[absmax_q[block_idx]], absmax2[group_idx]), + __float2half(offset)); + __half h[8]; + h[0] = __hmul(NF4_LUT_HALF[(b0) >> 4], scale); + h[1] = __hmul(NF4_LUT_HALF[(b0) & 0xF], scale); + h[2] = __hmul(NF4_LUT_HALF[(b1) >> 4], scale); + h[3] = __hmul(NF4_LUT_HALF[(b1) & 0xF], scale); + h[4] = __hmul(NF4_LUT_HALF[(b2) >> 4], scale); + h[5] = __hmul(NF4_LUT_HALF[(b2) & 0xF], scale); + h[6] = __hmul(NF4_LUT_HALF[(b3) >> 4], scale); + h[7] = __hmul(NF4_LUT_HALF[(b3) & 0xF], scale); + + uint4 out_pack; + reinterpret_cast<__half *>(&out_pack)[0] = h[0]; + reinterpret_cast<__half *>(&out_pack)[1] = h[1]; + reinterpret_cast<__half *>(&out_pack)[2] = h[2]; + reinterpret_cast<__half *>(&out_pack)[3] = h[3]; + reinterpret_cast<__half *>(&out_pack)[4] = h[4]; + reinterpret_cast<__half *>(&out_pack)[5] = h[5]; + reinterpret_cast<__half *>(&out_pack)[6] = h[6]; + reinterpret_cast<__half *>(&out_pack)[7] = h[7]; + + ((uint4 *)(output + half_base))[0] = out_pack; +} +__global__ void nf4_dequant_v7(const uint8_t *__restrict__ packed, + const uint8_t *__restrict__ absmax_q, + const __half *__restrict__ absmax2, float offset, + int64_t total_elements, int blocksize, + int group_size, __half *__restrict__ output) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int64_t total_bytes = total_elements >> 1; // 2 half / byte + int64_t byte_idx = (int64_t)tid * 4; + __shared__ __half s_nf4[16]; + + if(threadIdx.x<16){ + s_nf4[threadIdx.x]=NF4_LUT_HALF[threadIdx.x]; + } + if (byte_idx >= total_bytes) + return; + uint32_t pack4 = ((const uint32_t *)packed)[tid]; + uint8_t b0 = pack4 & 0xFF; + uint8_t b1 = (pack4 >> 8) & 0xFF; + uint8_t b2 = (pack4 >> 16) & 0xFF; + uint8_t b3 = (pack4 >> 24) & 0xFF; + int64_t half_base = byte_idx << 1; + int block_idx = half_base / blocksize; + int group_idx = block_idx / group_size; + __half scale = + __hadd(__hmul(CODE2_LUT[absmax_q[block_idx]], absmax2[group_idx]), + __float2half(offset)); + __half h[8]; + h[0] = __hmul(s_nf4[(b0) >> 4], scale); + h[1] = __hmul(s_nf4[(b0) & 0xF], scale); + h[2] = __hmul(s_nf4[(b1) >> 4], scale); + h[3] = __hmul(s_nf4[(b1) & 0xF], scale); + h[4] = __hmul(s_nf4[(b2) >> 4], scale); + h[5] = __hmul(s_nf4[(b2) & 0xF], scale); + h[6] = __hmul(s_nf4[(b3) >> 4], scale); + h[7] = __hmul(s_nf4[(b3) & 0xF], scale); + + uint4 out_pack; + reinterpret_cast<__half *>(&out_pack)[0] = h[0]; + reinterpret_cast<__half *>(&out_pack)[1] = h[1]; + reinterpret_cast<__half *>(&out_pack)[2] = h[2]; + reinterpret_cast<__half *>(&out_pack)[3] = h[3]; + reinterpret_cast<__half *>(&out_pack)[4] = h[4]; + reinterpret_cast<__half *>(&out_pack)[5] = h[5]; + reinterpret_cast<__half *>(&out_pack)[6] = h[6]; + reinterpret_cast<__half *>(&out_pack)[7] = h[7]; + + ((uint4 *)(output + half_base))[0] = out_pack; +} +int read_weight_file(const char *filename, int64_t *rows, int64_t *cols, + int *blocksize, uint8_t **packed, uint8_t **absmax_q, + __half **absmax2, __half **code2, float *offset) { + FILE *fp = fopen(filename, "rb"); + if (!fp) { + fprintf(stderr, "无法打开文件: %s\n", filename); + return -1; + } + + if (!read_exact(fp, rows, sizeof(int64_t), 1) || + !read_exact(fp, cols, sizeof(int64_t), 1) || + !read_exact(fp, blocksize, sizeof(int32_t), 1)) { + fprintf(stderr, "读取文件头失败: %s\n", filename); + fclose(fp); + return -1; + } + + int64_t total_elements = (*rows) * (*cols); + int64_t num_packed = (total_elements + 1) / 2; + int64_t num_blocks = (total_elements + *blocksize - 1) / *blocksize; + int64_t num_groups = (num_blocks + 255) / 256; + + printf("\n文件信息:\n"); + printf(" 矩阵: %ld x %ld\n", *rows, *cols); + printf(" 总元素数: %ld\n", total_elements); + printf(" blocksize: %d\n", *blocksize); + printf(" 打包数据大小: %ld bytes\n", num_packed); + printf(" 量化块数: %ld\n", num_blocks); + printf(" 分组数: %ld\n", num_groups); + + *packed = (uint8_t *)malloc(num_packed); + *absmax_q = (uint8_t *)malloc(num_blocks); + *absmax2 = (__half *)malloc(num_groups * sizeof(__half)); + *code2 = (__half *)malloc(256 * sizeof(__half)); + + if (!*packed || !*absmax_q || !*absmax2 || !*code2) { + fprintf(stderr, "主机内存分配失败\n"); + fclose(fp); + return -1; + } + + if (!read_exact(fp, *packed, 1, num_packed) || + !read_exact(fp, *absmax_q, 1, num_blocks) || + !read_exact(fp, *absmax2, sizeof(__half), num_groups) || + !read_exact(fp, *code2, sizeof(__half), 256) || + !read_exact(fp, offset, sizeof(float), 1)) { + fprintf(stderr, "读取量化数据失败: %s\n", filename); + fclose(fp); + return -1; + } + + fclose(fp); + printf("文件读取成功\n"); + return 0; +} + +void save_dequantized_weight(const char *filename, __half *weight, + int64_t total_elements) { + ensure_directory_exists("../mx_results"); + + char full_path[512]; + snprintf(full_path, sizeof(full_path), "../mx_results/%s", filename); + + FILE *fp = fopen(full_path, "wb"); + if (!fp) { + fprintf(stderr, "无法创建输出文件: %s\n", full_path); + return; + } + + fwrite(weight, sizeof(__half), total_elements, fp); + fclose(fp); + + printf("已保存解量化结果: %s (%.2f MB)\n", full_path, + (total_elements * sizeof(__half)) / (1024.0 * 1024.0)); +} + +double get_time_ms() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0; +} + +double calculate_bandwidth(int64_t total_elements, double time_ms, int blocksize) { + int64_t input_bytes = (total_elements + 1) / 2; + int64_t num_blocks = (total_elements + blocksize - 1) / blocksize; + int64_t num_groups = (num_blocks + 255) / 256; + input_bytes += num_blocks; + input_bytes += num_groups * sizeof(__half); + input_bytes += 256 * sizeof(__half); + + int64_t output_bytes = total_elements * sizeof(__half); + int64_t total_bytes = input_bytes + output_bytes; + + return (total_bytes / (1024.0 * 1024.0 * 1024.0)) / (time_ms / 1000.0); +} + +int main(int argc, char **argv) { + if (argc != 2) { + printf("\n使用方法: %s <权重文件.bin>\n", argv[0]); + printf("示例: %s ../weight_data/weight_1024x1024_bs64.bin\n\n", argv[0]); + return -1; + } + + const char *input_file = argv[1]; + + ensure_directory_exists("../mx_results"); + init_nf4_lut(); + + int64_t rows, cols; + int blocksize; + uint8_t *h_packed = nullptr; + uint8_t *h_absmax_q = nullptr; + __half *h_absmax2 = nullptr; + __half *h_code2 = nullptr; + float offset = 0.f; + + if (read_weight_file(input_file, &rows, &cols, &blocksize, &h_packed, + &h_absmax_q, &h_absmax2, &h_code2, &offset) != 0) { + return -1; + } + + int64_t total_elements = rows * cols; + int64_t num_packed = (total_elements + 1) / 2; + int64_t num_blocks = (total_elements + blocksize - 1) / blocksize; + int64_t num_groups = (num_blocks + 255) / 256; + + uint8_t *d_packed = nullptr; + uint8_t *d_absmax_q = nullptr; + __half *d_absmax2 = nullptr; + __half *d_code2 = nullptr; + __half *d_output = nullptr; + + GPU_CHECK(mcMalloc(&d_packed, num_packed)); + GPU_CHECK(mcMalloc(&d_absmax_q, num_blocks)); + GPU_CHECK(mcMalloc(&d_absmax2, num_groups * sizeof(__half))); + GPU_CHECK(mcMalloc(&d_code2, 256 * sizeof(__half))); + GPU_CHECK(mcMalloc(&d_output, total_elements * sizeof(__half))); + + GPU_CHECK(mcMemcpy(d_packed, h_packed, num_packed, mcMemcpyHostToDevice)); + GPU_CHECK( + mcMemcpy(d_absmax_q, h_absmax_q, num_blocks, mcMemcpyHostToDevice)); + GPU_CHECK(mcMemcpy(d_absmax2, h_absmax2, num_groups * sizeof(__half), + mcMemcpyHostToDevice)); + GPU_CHECK( + mcMemcpy(d_code2, h_code2, 256 * sizeof(__half), mcMemcpyHostToDevice)); + GPU_CHECK(mcMemcpyToSymbol(CODE2_LUT, h_code2, 256 * sizeof(__half))); + + __half *h_output = (__half *)malloc(total_elements * sizeof(__half)); + if (!h_output) { + fprintf(stderr, "主机输出内存分配失败\n"); + return -1; + } + + int threads = 256; + int64_t total_bytes = total_elements >> 1; + int blocks = (total_bytes / 4 + threads - 1) / threads; + + for (int i = 0; i < 5; ++i) { + nf4_dequant_v7<<>>(d_packed, d_absmax_q, d_absmax2, offset, + total_elements, blocksize, 256, + d_output); + } + GPU_CHECK(mcDeviceSynchronize()); + + double start_time = get_time_ms(); + for (int i = 0; i < 100; ++i) { + nf4_dequant_v7<<>>(d_packed, d_absmax_q, d_absmax2, offset, + total_elements, blocksize, 256, + d_output); + } + GPU_CHECK(mcDeviceSynchronize()); + double end_time = get_time_ms(); + + double avg_time_ms = (end_time - start_time) / 100.0; + double bandwidth = calculate_bandwidth(total_elements, avg_time_ms, blocksize); + + GPU_CHECK(mcMemcpy(h_output, d_output, total_elements * sizeof(__half), + mcMemcpyDeviceToHost)); + + char output_file[256]; + snprintf(output_file, sizeof(output_file), "dequant_%ldx%ld_bs%d.fp16", rows, + cols, blocksize); + save_dequantized_weight(output_file, h_output, total_elements); + + char log_file[256]; + snprintf(log_file, sizeof(log_file), "perf_%ldx%ld_bs%d.log", rows, cols, + blocksize); + + char log_path[512]; + snprintf(log_path, sizeof(log_path), "../mx_results/%s", log_file); + FILE *log_fp = fopen(log_path, "w"); + if (log_fp) { + fprintf(log_fp, "input_file=%s\n", input_file); + fprintf(log_fp, "rows=%ld\n", rows); + fprintf(log_fp, "cols=%ld\n", cols); + fprintf(log_fp, "blocksize=%d\n", blocksize); + fprintf(log_fp, "total_elements=%ld\n", total_elements); + fprintf(log_fp, "kernel_time_ms=%.4f\n", avg_time_ms); + fprintf(log_fp, "bandwidth_gbps=%.2f\n", bandwidth); + fprintf(log_fp, "output_file=mx_results/%s\n", output_file); + fclose(log_fp); + } + + printf("\n输入文件: %s\n", input_file); + printf("矩阵大小: %ld x %ld\n", rows, cols); + printf("核函数执行时间: %.4f ms\n", avg_time_ms); + printf("有效内存带宽: %.2f GB/s\n", bandwidth); + printf("输出文件: mx_results/%s\n", output_file); + + free(h_packed); + free(h_absmax_q); + free(h_absmax2); + free(h_code2); + free(h_output); + + mcFree(d_packed); + mcFree(d_absmax_q); + mcFree(d_absmax2); + mcFree(d_code2); + mcFree(d_output); + + return 0; +} diff --git a/03_nf4_dequant/ayepei/mx/run.md b/03_nf4_dequant/ayepei/mx/run.md new file mode 100644 index 0000000..3bcdb05 --- /dev/null +++ b/03_nf4_dequant/ayepei/mx/run.md @@ -0,0 +1,45 @@ +# 沐曦 NF4 反量化 + +## 1) 编译 + +优先使用 `mxcc`(沐曦环境),没有则回退 `nvcc`(CUDA 兼容环境)。 + +```bash +cd 03_nf4_dequant/mx + +# 方式1:脚本自动选编译器 +bash run_all.sh + +# 方式2:手动编译 +mxcc -O3 -std=c++17 -o nf4_dequant_mx nf4_dequant_mx.maca + + +## 2) 单文件运行 + +```bash +./nf4_dequant_mx ../weight_data/weight_1024x1024_bs64.bin +``` + +输出: +- 解量化结果:`../mx_results/dequant__bs.fp16` +- 性能日志:`../mx_results/perf__bs.log` + +## 3) 批量运行 + +```bash +bash run_all.sh +``` + + + +## 4) 与 BnB 结果对比 + +先保证 `../bnb_results` 与 `../bnb_benchmark_results.csv` 已存在。 + +```bash +python compare_results.py +``` + +将生成: +- `comparison_mx_results.csv` +- `comparison_mx_results.md` diff --git a/03_nf4_dequant/ayepei/mx/run_all.sh b/03_nf4_dequant/ayepei/mx/run_all.sh new file mode 100755 index 0000000..41b4097 --- /dev/null +++ b/03_nf4_dequant/ayepei/mx/run_all.sh @@ -0,0 +1,16 @@ +#!/bin/bash +set -euo pipefail + +if [ ! -f "./nf4_dequant_mx" ]; then + echo "编译沐曦 NF4 程序..." + if command -v mxcc >/dev/null 2>&1; then + mxcc -O3 -std=c++17 -o nf4_dequant_mx nf4_dequant_mx.maca + else + echo "未找到 mxcc,请先加载沐曦编译环境。" + exit 1 + fi +fi + +for f in ../weight_data/*.bin; do + ./nf4_dequant_mx "$f" +done diff --git a/03_nf4_dequant/ayepei/mx_results/perf_1024x1024_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_1024x1024_bs128.log new file mode 100644 index 0000000..373191b --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_1024x1024_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_1024x1024_bs128.bin +rows=1024 +cols=1024 +blocksize=128 +total_elements=1048576 +kernel_time_ms=0.0248 +bandwidth_gbps=98.89 +output_file=mx_results/dequant_1024x1024_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/mx_results/perf_1024x1024_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_1024x1024_bs64.log new file mode 100644 index 0000000..4fca2fe --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_1024x1024_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_1024x1024_bs64.bin +rows=1024 +cols=1024 +blocksize=64 +total_elements=1048576 +kernel_time_ms=0.0169 +bandwidth_gbps=144.99 +output_file=mx_results/dequant_1024x1024_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/mx_results/perf_16384x16384_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_16384x16384_bs128.log new file mode 100644 index 0000000..4f974bb --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_16384x16384_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_16384x16384_bs128.bin +rows=16384 +cols=16384 +blocksize=128 +total_elements=268435456 +kernel_time_ms=0.9282 +bandwidth_gbps=675.43 +output_file=mx_results/dequant_16384x16384_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/mx_results/perf_16384x16384_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_16384x16384_bs64.log new file mode 100644 index 0000000..5150ce6 --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_16384x16384_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_16384x16384_bs64.bin +rows=16384 +cols=16384 +blocksize=64 +total_elements=268435456 +kernel_time_ms=1.0106 +bandwidth_gbps=622.36 +output_file=mx_results/dequant_16384x16384_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/mx_results/perf_2048x2048_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_2048x2048_bs128.log new file mode 100644 index 0000000..eb8695b --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_2048x2048_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_2048x2048_bs128.bin +rows=2048 +cols=2048 +blocksize=128 +total_elements=4194304 +kernel_time_ms=0.0325 +bandwidth_gbps=301.44 +output_file=mx_results/dequant_2048x2048_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/mx_results/perf_2048x2048_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_2048x2048_bs64.log new file mode 100644 index 0000000..84487fa --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_2048x2048_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_2048x2048_bs64.bin +rows=2048 +cols=2048 +blocksize=64 +total_elements=4194304 +kernel_time_ms=0.0281 +bandwidth_gbps=349.12 +output_file=mx_results/dequant_2048x2048_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/mx_results/perf_256x256_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_256x256_bs128.log new file mode 100644 index 0000000..fd8002e --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_256x256_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_256x256_bs128.bin +rows=256 +cols=256 +blocksize=128 +total_elements=65536 +kernel_time_ms=0.0093 +bandwidth_gbps=16.44 +output_file=mx_results/dequant_256x256_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/mx_results/perf_256x256_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_256x256_bs64.log new file mode 100644 index 0000000..fff7366 --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_256x256_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_256x256_bs64.bin +rows=256 +cols=256 +blocksize=64 +total_elements=65536 +kernel_time_ms=0.0094 +bandwidth_gbps=16.44 +output_file=mx_results/dequant_256x256_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/mx_results/perf_3421x3146_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_3421x3146_bs128.log new file mode 100644 index 0000000..3d4f92b --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_3421x3146_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_3421x3146_bs128.bin +rows=3421 +cols=3146 +blocksize=128 +total_elements=10762466 +kernel_time_ms=0.0509 +bandwidth_gbps=494.16 +output_file=mx_results/dequant_3421x3146_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/mx_results/perf_3421x3146_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_3421x3146_bs64.log new file mode 100644 index 0000000..1ce3918 --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_3421x3146_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_3421x3146_bs64.bin +rows=3421 +cols=3146 +blocksize=64 +total_elements=10762466 +kernel_time_ms=0.0517 +bandwidth_gbps=488.13 +output_file=mx_results/dequant_3421x3146_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/mx_results/perf_4096x4096_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_4096x4096_bs128.log new file mode 100644 index 0000000..139738b --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_4096x4096_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_4096x4096_bs128.bin +rows=4096 +cols=4096 +blocksize=128 +total_elements=16777216 +kernel_time_ms=0.0813 +bandwidth_gbps=482.23 +output_file=mx_results/dequant_4096x4096_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/mx_results/perf_4096x4096_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_4096x4096_bs64.log new file mode 100644 index 0000000..b793bac --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_4096x4096_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_4096x4096_bs64.bin +rows=4096 +cols=4096 +blocksize=64 +total_elements=16777216 +kernel_time_ms=0.0735 +bandwidth_gbps=534.60 +output_file=mx_results/dequant_4096x4096_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/mx_results/perf_512x512_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_512x512_bs128.log new file mode 100644 index 0000000..fa81358 --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_512x512_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_512x512_bs128.bin +rows=512 +cols=512 +blocksize=128 +total_elements=262144 +kernel_time_ms=0.0154 +bandwidth_gbps=39.84 +output_file=mx_results/dequant_512x512_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/mx_results/perf_512x512_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_512x512_bs64.log new file mode 100644 index 0000000..5b53834 --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_512x512_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_512x512_bs64.bin +rows=512 +cols=512 +blocksize=64 +total_elements=262144 +kernel_time_ms=0.0154 +bandwidth_gbps=39.91 +output_file=mx_results/dequant_512x512_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/mx_results/perf_6578x1236_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_6578x1236_bs128.log new file mode 100644 index 0000000..0afbdb3 --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_6578x1236_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_6578x1236_bs128.bin +rows=6578 +cols=1236 +blocksize=128 +total_elements=8130408 +kernel_time_ms=0.0402 +bandwidth_gbps=472.88 +output_file=mx_results/dequant_6578x1236_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/mx_results/perf_6578x1236_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_6578x1236_bs64.log new file mode 100644 index 0000000..ae554f6 --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_6578x1236_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_6578x1236_bs64.bin +rows=6578 +cols=1236 +blocksize=64 +total_elements=8130408 +kernel_time_ms=0.0485 +bandwidth_gbps=392.61 +output_file=mx_results/dequant_6578x1236_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/mx_results/perf_7000x7000_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_7000x7000_bs128.log new file mode 100644 index 0000000..6b99edc --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_7000x7000_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_7000x7000_bs128.bin +rows=7000 +cols=7000 +blocksize=128 +total_elements=49000000 +kernel_time_ms=0.1881 +bandwidth_gbps=608.44 +output_file=mx_results/dequant_7000x7000_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/mx_results/perf_7000x7000_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_7000x7000_bs64.log new file mode 100644 index 0000000..3ea1f6d --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_7000x7000_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_7000x7000_bs64.bin +rows=7000 +cols=7000 +blocksize=64 +total_elements=49000000 +kernel_time_ms=0.1905 +bandwidth_gbps=602.62 +output_file=mx_results/dequant_7000x7000_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/mx_results/perf_8192x8192_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_8192x8192_bs128.log new file mode 100644 index 0000000..b10832c --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_8192x8192_bs128.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_8192x8192_bs128.bin +rows=8192 +cols=8192 +blocksize=128 +total_elements=67108864 +kernel_time_ms=0.2358 +bandwidth_gbps=664.67 +output_file=mx_results/dequant_8192x8192_bs128.fp16 diff --git a/03_nf4_dequant/ayepei/mx_results/perf_8192x8192_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_8192x8192_bs64.log new file mode 100644 index 0000000..7d0ca25 --- /dev/null +++ b/03_nf4_dequant/ayepei/mx_results/perf_8192x8192_bs64.log @@ -0,0 +1,8 @@ +input_file=../weight_data/weight_8192x8192_bs64.bin +rows=8192 +cols=8192 +blocksize=64 +total_elements=67108864 +kernel_time_ms=0.2625 +bandwidth_gbps=599.08 +output_file=mx_results/dequant_8192x8192_bs64.fp16 diff --git a/03_nf4_dequant/ayepei/nv/compare_results.py b/03_nf4_dequant/ayepei/nv/compare_results.py new file mode 100644 index 0000000..374f226 --- /dev/null +++ b/03_nf4_dequant/ayepei/nv/compare_results.py @@ -0,0 +1,146 @@ +import torch +import numpy as np +import pandas as pd +import csv +from pathlib import Path + + +def compare_all(): + """对比所有 cuda 和 BnB 的结果""" + + BASE = Path("..") + + cuda_DIR = BASE / "cuda_results" + BNB_DIR = BASE / "bnb_results" + BNB_CSV = BASE / "bnb_benchmark_results.csv" + + results = [] + + # 读取 BnB benchmark CSV + if not BNB_CSV.exists(): + print(f" 找不到 {BNB_CSV}") + return + + bnb_data = {} + with open(BNB_CSV, 'r') as f: + reader = csv.DictReader(f) + for row in reader: + key = f"{row['Shape']}_bs{row['Blocksize']}" + bnb_data[key] = { + 'time_ms': float(row['BnB Time (ms)']), + 'bnb_file': row['BnB Output File'] + } + + # 查找所有 cuda 结果 + cuda_files = list(cuda_DIR.glob("dequant_*.fp16")) + + print("\n" + "="*80) + print("对比结果汇总") + print("="*80) + print(f"{'Shape':<12} {'Block':<6} {'BnB (ms)':<12} {'cuda (ms)':<12} " + f"{'Speedup':<8} {'MAE':<12} {'Max Diff':<12}") + print("-"*80) + + for cuda_file in cuda_files: + filename = cuda_file.name + + parts = filename.replace('dequant_', '').replace('.fp16', '').split('_bs') + shape = parts[0] + blocksize = parts[1] + + # 读取 cuda 输出 + cuda_data = np.fromfile(cuda_file, dtype=np.float16) + cuda_tensor = torch.from_numpy(cuda_data) + + # 读取 BnB 输出 + bnb_file = BNB_DIR / f"bnb_{shape}_bs{blocksize}.fp16" + if not bnb_file.exists(): + print(f" 找不到 BnB 文件: {bnb_file}") + continue + + bnb_data_np = np.fromfile(bnb_file, dtype=np.float16) + bnb_tensor = torch.from_numpy(bnb_data_np) + + + + # 误差计算 + mae = torch.mean(torch.abs(bnb_tensor - cuda_tensor)).item() + mse = torch.mean((bnb_tensor - cuda_tensor) ** 2).item() + max_diff = torch.max(torch.abs(bnb_tensor - cuda_tensor)).item() + + # 时间读取 + key = f"{shape}_bs{blocksize}" + bnb_time = bnb_data[key]['time_ms'] + + log_file = cuda_DIR / f"perf_{shape}_bs{blocksize}.log" + cuda_time = 0 + + if log_file.exists(): + with open(log_file, 'r') as f: + for line in f: + if 'kernel_time_ms' in line: + cuda_time = float(line.strip().split('=')[1]) + + speedup = bnb_time / cuda_time if cuda_time > 0 else 0 + + print(f"{shape:<12} {blocksize:<6} " + f"{bnb_time:<12.4f} {cuda_time:<12.4f} " + f"{speedup:<8.2f} {mae:<12.8f} {max_diff:<12.8f}") + + results.append({ + 'shape': shape, + 'blocksize': int(blocksize), + 'bnb_time_ms': bnb_time, + 'cuda_time_ms': cuda_time, + 'speedup': speedup, + 'mae': mae, + 'mse': mse, + 'max_diff': max_diff, + }) + + print("="*80) + + + # 保存 CSV + df = pd.DataFrame(results) + out_csv = "comparison_cuda_results.csv" + float_format = '%.8f' + df.to_csv(out_csv, index=False, float_format=float_format) + + print(f"\n 对比结果已保存到: {out_csv}") + md_file = "comparison_cuda_results.md" + with open(md_file, 'w', encoding='utf-8') as f: + # 写入表格标题 + f.write("# cuda vs BnB 对比结果\n\n") + + # 写入表头 + f.write("| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff |\n") + f.write("|-------|-------|----------|-----------|---------|-----|----------|\n") + + # 写入数据 + for _, row in df.iterrows(): + line = (f"| {row['shape']} | {row['blocksize']} | " + f"{row['bnb_time_ms']:.4f} | {row['cuda_time_ms']:.4f} | " + f"{row['speedup']:.2f} | {row['mae']:.8f} | {row['max_diff']:.8f} |") + f.write(line + '\n') + + # 添加详细数据表 + f.write("\n\n## 详细数据\n\n") + f.write("| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | MSE | Max Diff |\n") + f.write("|-------|-------|----------|-----------|---------|-----|-----|----------|\n") + + for _, row in df.iterrows(): + line = (f"| {row['shape']} | {row['blocksize']} | " + f"{row['bnb_time_ms']:.4f} | {row['cuda_time_ms']:.4f} | " + f"{row['speedup']:.2f} | {row['mae']:.8f} | {row['mse']:.8f} | " + f"{row['max_diff']:.8f} |") + f.write(line + '\n') + + print(f" Markdown表格已保存到: {md_file}") + + return df + + + +if __name__ == "__main__": + compare_all() \ No newline at end of file diff --git a/03_nf4_dequant/ayepei/nv/comparison_cuda_results.csv b/03_nf4_dequant/ayepei/nv/comparison_cuda_results.csv new file mode 100644 index 0000000..8e1718b --- /dev/null +++ b/03_nf4_dequant/ayepei/nv/comparison_cuda_results.csv @@ -0,0 +1,21 @@ +shape,blocksize,bnb_time_ms,cuda_time_ms,speedup,mae,mse,max_diff +4096x4096,128,0.09050000,0.01340000,6.75373134,0.00020349,0.00000018,0.00390625 +4096x4096,64,0.08910000,0.01370000,6.50364964,0.00018179,0.00000018,0.00390625 +3421x3146,64,0.09250000,0.00860000,10.75581395,0.00017142,0.00000012,0.00390625 +7000x7000,128,0.09390000,0.04120000,2.27912621,0.00025988,0.00000024,0.00390625 +2048x2048,64,0.09100000,0.00470000,19.36170213,0.00022352,0.00000018,0.00390625 +8192x8192,128,0.09120000,0.05520000,1.65217391,0.00022542,0.00000018,0.00390625 +6578x1236,64,0.08950000,0.00700000,12.78571429,0.00022066,0.00000018,0.00390625 +2048x2048,128,0.09250000,0.00480000,19.27083333,0.00029397,0.00000030,0.00390625 +7000x7000,64,0.09090000,0.04160000,2.18509615,0.00024140,0.00000024,0.00390625 +256x256,128,0.10640000,0.00280000,38.00000000,0.00024271,0.00000024,0.00390625 +256x256,64,0.11440000,0.00280000,40.85714286,0.00022900,0.00000024,0.00390625 +6578x1236,128,0.08760000,0.00700000,12.51428571,0.00026894,0.00000024,0.00390625 +512x512,64,0.10100000,0.00280000,36.07142857,0.00018466,0.00000018,0.00390625 +16384x16384,128,0.29730000,0.21040000,1.41302281,0.00022352,0.00000018,0.00390625 +16384x16384,64,0.30740000,0.21100000,1.45687204,0.00019693,0.00000018,0.00390625 +8192x8192,64,0.09180000,0.05560000,1.65107914,0.00017440,0.00000012,0.00390625 +1024x1024,64,0.09600000,0.00300000,32.00000000,0.00028801,0.00000030,0.00390625 +3421x3146,128,0.09040000,0.00850000,10.63529412,0.00026965,0.00000024,0.00390625 +1024x1024,128,0.09120000,0.00300000,30.40000000,0.00024605,0.00000024,0.00390625 +512x512,128,0.09770000,0.00260000,37.57692308,0.00029063,0.00000030,0.00390625 diff --git a/03_nf4_dequant/ayepei/nv/comparison_cuda_results.md b/03_nf4_dequant/ayepei/nv/comparison_cuda_results.md new file mode 100644 index 0000000..d6bd827 --- /dev/null +++ b/03_nf4_dequant/ayepei/nv/comparison_cuda_results.md @@ -0,0 +1,50 @@ +# cuda vs BnB 对比结果 + +| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff | +|-------|-------|----------|-----------|---------|-----|----------| +| 4096x4096 | 128 | 0.0905 | 0.0134 | 6.75 | 0.00020349 | 0.00390625 | +| 4096x4096 | 64 | 0.0891 | 0.0137 | 6.50 | 0.00018179 | 0.00390625 | +| 3421x3146 | 64 | 0.0925 | 0.0086 | 10.76 | 0.00017142 | 0.00390625 | +| 7000x7000 | 128 | 0.0939 | 0.0412 | 2.28 | 0.00025988 | 0.00390625 | +| 2048x2048 | 64 | 0.0910 | 0.0047 | 19.36 | 0.00022352 | 0.00390625 | +| 8192x8192 | 128 | 0.0912 | 0.0552 | 1.65 | 0.00022542 | 0.00390625 | +| 6578x1236 | 64 | 0.0895 | 0.0070 | 12.79 | 0.00022066 | 0.00390625 | +| 2048x2048 | 128 | 0.0925 | 0.0048 | 19.27 | 0.00029397 | 0.00390625 | +| 7000x7000 | 64 | 0.0909 | 0.0416 | 2.19 | 0.00024140 | 0.00390625 | +| 256x256 | 128 | 0.1064 | 0.0028 | 38.00 | 0.00024271 | 0.00390625 | +| 256x256 | 64 | 0.1144 | 0.0028 | 40.86 | 0.00022900 | 0.00390625 | +| 6578x1236 | 128 | 0.0876 | 0.0070 | 12.51 | 0.00026894 | 0.00390625 | +| 512x512 | 64 | 0.1010 | 0.0028 | 36.07 | 0.00018466 | 0.00390625 | +| 16384x16384 | 128 | 0.2973 | 0.2104 | 1.41 | 0.00022352 | 0.00390625 | +| 16384x16384 | 64 | 0.3074 | 0.2110 | 1.46 | 0.00019693 | 0.00390625 | +| 8192x8192 | 64 | 0.0918 | 0.0556 | 1.65 | 0.00017440 | 0.00390625 | +| 1024x1024 | 64 | 0.0960 | 0.0030 | 32.00 | 0.00028801 | 0.00390625 | +| 3421x3146 | 128 | 0.0904 | 0.0085 | 10.64 | 0.00026965 | 0.00390625 | +| 1024x1024 | 128 | 0.0912 | 0.0030 | 30.40 | 0.00024605 | 0.00390625 | +| 512x512 | 128 | 0.0977 | 0.0026 | 37.58 | 0.00029063 | 0.00390625 | + + +## 详细数据 + +| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | MSE | Max Diff | +|-------|-------|----------|-----------|---------|-----|-----|----------| +| 4096x4096 | 128 | 0.0905 | 0.0134 | 6.75 | 0.00020349 | 0.00000018 | 0.00390625 | +| 4096x4096 | 64 | 0.0891 | 0.0137 | 6.50 | 0.00018179 | 0.00000018 | 0.00390625 | +| 3421x3146 | 64 | 0.0925 | 0.0086 | 10.76 | 0.00017142 | 0.00000012 | 0.00390625 | +| 7000x7000 | 128 | 0.0939 | 0.0412 | 2.28 | 0.00025988 | 0.00000024 | 0.00390625 | +| 2048x2048 | 64 | 0.0910 | 0.0047 | 19.36 | 0.00022352 | 0.00000018 | 0.00390625 | +| 8192x8192 | 128 | 0.0912 | 0.0552 | 1.65 | 0.00022542 | 0.00000018 | 0.00390625 | +| 6578x1236 | 64 | 0.0895 | 0.0070 | 12.79 | 0.00022066 | 0.00000018 | 0.00390625 | +| 2048x2048 | 128 | 0.0925 | 0.0048 | 19.27 | 0.00029397 | 0.00000030 | 0.00390625 | +| 7000x7000 | 64 | 0.0909 | 0.0416 | 2.19 | 0.00024140 | 0.00000024 | 0.00390625 | +| 256x256 | 128 | 0.1064 | 0.0028 | 38.00 | 0.00024271 | 0.00000024 | 0.00390625 | +| 256x256 | 64 | 0.1144 | 0.0028 | 40.86 | 0.00022900 | 0.00000024 | 0.00390625 | +| 6578x1236 | 128 | 0.0876 | 0.0070 | 12.51 | 0.00026894 | 0.00000024 | 0.00390625 | +| 512x512 | 64 | 0.1010 | 0.0028 | 36.07 | 0.00018466 | 0.00000018 | 0.00390625 | +| 16384x16384 | 128 | 0.2973 | 0.2104 | 1.41 | 0.00022352 | 0.00000018 | 0.00390625 | +| 16384x16384 | 64 | 0.3074 | 0.2110 | 1.46 | 0.00019693 | 0.00000018 | 0.00390625 | +| 8192x8192 | 64 | 0.0918 | 0.0556 | 1.65 | 0.00017440 | 0.00000012 | 0.00390625 | +| 1024x1024 | 64 | 0.0960 | 0.0030 | 32.00 | 0.00028801 | 0.00000030 | 0.00390625 | +| 3421x3146 | 128 | 0.0904 | 0.0085 | 10.64 | 0.00026965 | 0.00000024 | 0.00390625 | +| 1024x1024 | 128 | 0.0912 | 0.0030 | 30.40 | 0.00024605 | 0.00000024 | 0.00390625 | +| 512x512 | 128 | 0.0977 | 0.0026 | 37.58 | 0.00029063 | 0.00000030 | 0.00390625 | diff --git a/03_nf4_dequant/ayepei/nv/nf4_dequant_cuda b/03_nf4_dequant/ayepei/nv/nf4_dequant_cuda new file mode 100755 index 0000000..352cffb Binary files /dev/null and b/03_nf4_dequant/ayepei/nv/nf4_dequant_cuda differ diff --git a/03_nf4_dequant/ayepei/nv/nf4_dequant_cuda.cu b/03_nf4_dequant/ayepei/nv/nf4_dequant_cuda.cu new file mode 100644 index 0000000..e213708 --- /dev/null +++ b/03_nf4_dequant/ayepei/nv/nf4_dequant_cuda.cu @@ -0,0 +1,643 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error at %s:%d - %s\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// 将表放入到常量内存加快速度 +__constant__ __half NF4_LUT_HALF[16]; +__constant__ __half CODE2_LUT[256]; + +void ensure_directory_exists(const char *path) { + struct stat st = {0}; + if (stat(path, &st) == -1) { +#ifdef _WIN32 + mkdir(path); +#else + mkdir(path, 0755); +#endif + } +} +__constant__ float NF4_LUT[16] = { + -1.00000000f, -0.69619280f, -0.52507305f, -0.39491710f, + -0.28444138f, -0.18477343f, -0.09105003f, 0.00000000f, + 0.07958030f, 0.16093020f, 0.24611230f, 0.33791524f, + 0.44070983f, 0.56261700f, 0.72295684f, 1.00000000f}; + +// 初始化 LUT (将 float 转换为 half) +void init_nf4_lut() { + float lut_f[16] = {-1.00000000f, -0.69619280f, -0.52507305f, -0.39491710f, + -0.28444138f, -0.18477343f, -0.09105003f, 0.00000000f, + 0.07958030f, 0.16093020f, 0.24611230f, 0.33791524f, + 0.44070983f, 0.56261700f, 0.72295684f, 1.00000000f}; + + __half lut_h[16]; + for (int i = 0; i < 16; i++) { + lut_h[i] = __float2half(lut_f[i]); + } + CUDA_CHECK(cudaMemcpyToSymbol(NF4_LUT_HALF, lut_h, sizeof(lut_h))); +} +// 基础nf4反量化,在计算中使用float进行计算 +__global__ void nf4_dequant_v1( + const uint8_t *__restrict__ packed, const uint8_t *__restrict__ absmax_q, + const __half *__restrict__ absmax2, // 修改为 __half + const __half *__restrict__ code2, // 修改为 __half + float offset, int64_t total_half_elements, int blocksize, int group_size, + __half2 *__restrict__ output) { + + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_half_elements) + return; + + uint8_t val = packed[tid]; + + int64_t idx_in_elements = tid << 1; + int64_t block_idx = idx_in_elements / blocksize; + int64_t group_idx = block_idx / group_size; + + + float s1 = __half2float(code2[absmax_q[block_idx]]); + float s2 = __half2float(absmax2[group_idx]); + float scale = (s1 * s2) + offset; + + float v1 = NF4_LUT[val >> 4] * scale; + float v2 = NF4_LUT[val & 0x0F] * scale; + + output[tid] = __floats2half2_rn(v1, v2); +} + + +// 中间的计算采用half +__global__ void nf4_dequant_v2( + const uint8_t *__restrict__ packed, + const uint8_t *__restrict__ absmax_q, + const __half *__restrict__ absmax2, + const __half *__restrict__ code2, + float offset, + int64_t total_half_elements, + int blocksize, + int group_size, + __half2 *__restrict__ output) +{ + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_half_elements) return; + + uint8_t val = packed[tid]; + + int64_t idx = tid << 1; + int64_t block_idx = idx / blocksize; + int64_t group_idx = block_idx / group_size; + + // 全half scale + __half s1 = code2[absmax_q[block_idx]]; + __half s2 = absmax2[group_idx]; + + __half scale = __hadd(__hmul(s1, s2),__float2half(offset) ); + + // LUT + __half v1 = __hmul(__float2half(NF4_LUT[val >> 4]), scale); + __half v2 = __hmul(__float2half(NF4_LUT[val & 0x0F]), scale); + + output[tid] = __halves2half2(v1, v2); +} + + + +// 预先先对NF4_LUT_HALF进行处理,转化为__half +__global__ void nf4_dequant_v3( + const uint8_t *__restrict__ packed, + const uint8_t *__restrict__ absmax_q, + const __half *__restrict__ absmax2, + const __half *__restrict__ code2, + float offset, + int64_t total_half_elements, + int blocksize, + int group_size, + __half2 *__restrict__ output) +{ + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_half_elements) return; + uint8_t val = packed[tid]; + int64_t idx = tid << 1; + int64_t block_idx = idx / blocksize; + int64_t group_idx = block_idx / group_size; + __half scale = __hadd(__hmul(code2[absmax_q[block_idx]], absmax2[group_idx]),__float2half(offset) ); + __half v1 = __hmul(NF4_LUT_HALF[val >> 4], scale); + __half v2 = __hmul(NF4_LUT_HALF[val & 0x0F], scale); + + output[tid] = __halves2half2(v1, v2); +} +// 利用共享内存存储scale,但当几个线程在进行存储时其他的线程并没有做其他的处理,时间并没有缩短 +__global__ void nf4_dequant_v4( + const uint8_t *__restrict__ packed, + const uint8_t *__restrict__ absmax_q, + const __half *__restrict__ absmax2, + const __half *__restrict__ code2, + float offset, + int64_t total_half_elements, + int blocksize, + int group_size, + __half2 *__restrict__ output) +{ + extern __shared__ __half shared_scale[]; + + int tid = threadIdx.x; + int idx = blockIdx.x * blockDim.x + tid; + + int64_t cta_half_start = (int64_t)blockIdx.x * blockDim.x * 2; + int64_t cta_half_end = min(cta_half_start + blockDim.x * 2,total_half_elements * 2); + + int first_block = cta_half_start / blocksize; + int last_block = (cta_half_end + blocksize - 1) / blocksize; + + int num_scale = last_block - first_block; + + for (int i = tid; i < num_scale; i += blockDim.x) + { + int block_idx = first_block + i; + int group_idx = block_idx / group_size; + + __half s1 = code2[absmax_q[block_idx]]; + __half s2 = absmax2[group_idx]; + + shared_scale[i] = __hadd(__hmul(s1, s2), __float2half(offset)); + } + + __syncthreads(); + + if (idx >= total_half_elements) return; + + uint8_t val = packed[idx]; + + int64_t half_idx = ((int64_t)idx) << 1; + + int block_idx = half_idx / blocksize; + int local_block = block_idx - first_block; + + __half scale = shared_scale[local_block]; + + __half v1 = __hmul(NF4_LUT_HALF[val >> 4], scale); + __half v2 = __hmul(NF4_LUT_HALF[val & 0xF], scale); + + output[idx] = __halves2half2(v1, v2); +} + +// 进行向量化读取,一个程序处理多个 +__global__ void nf4_dequant_v5( + const uint8_t *__restrict__ packed, + const uint8_t *__restrict__ absmax_q, + const __half *__restrict__ absmax2, + const __half *__restrict__ code2, + float offset, + int64_t total_half_elements, + int blocksize, + int group_size, + __half *__restrict__ output) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int64_t total_bytes = total_half_elements >> 1; // 2 half / byte + int64_t byte_idx = (int64_t)tid * 4; + if (byte_idx >= total_bytes) return; + uint32_t pack4 = ((const uint32_t*)packed)[tid]; + uint8_t b0 = pack4 & 0xFF; + uint8_t b1 = (pack4 >> 8 ) & 0xFF; + uint8_t b2 = (pack4 >> 16) & 0xFF; + uint8_t b3 = (pack4 >> 24) & 0xFF; + int64_t half_base = byte_idx << 1; + int block_idx = half_base / blocksize; + int group_idx = block_idx / group_size; + __half scale = __hadd(__hmul(code2[absmax_q[block_idx]], absmax2[group_idx]), __float2half(offset)); + __half h[8]; + h[0] = __hmul(NF4_LUT_HALF[(b0)>>4], scale); + h[1] = __hmul(NF4_LUT_HALF[(b0)&0xF], scale); + h[2] = __hmul(NF4_LUT_HALF[(b1)>>4], scale); + h[3] = __hmul(NF4_LUT_HALF[(b1)&0xF], scale); + h[4] = __hmul(NF4_LUT_HALF[(b2)>>4], scale); + h[5] = __hmul(NF4_LUT_HALF[(b2)&0xF], scale); + h[6] = __hmul(NF4_LUT_HALF[(b3)>>4], scale); + h[7] = __hmul(NF4_LUT_HALF[(b3)&0xF], scale); + + uint4 out_pack; + reinterpret_cast<__half*>(&out_pack)[0] = h[0]; + reinterpret_cast<__half*>(&out_pack)[1] = h[1]; + reinterpret_cast<__half*>(&out_pack)[2] = h[2]; + reinterpret_cast<__half*>(&out_pack)[3] = h[3]; + reinterpret_cast<__half*>(&out_pack)[4] = h[4]; + reinterpret_cast<__half*>(&out_pack)[5] = h[5]; + reinterpret_cast<__half*>(&out_pack)[6] = h[6]; + reinterpret_cast<__half*>(&out_pack)[7] = h[7]; + + ((uint4*)(output + half_base))[0] = out_pack; +} +// 将code也放入常量内存 +__global__ void nf4_dequant_v6(const uint8_t *__restrict__ packed, + const uint8_t *__restrict__ absmax_q, + const __half *__restrict__ absmax2, float offset, + int64_t total_elements, int blocksize, + int group_size, __half *__restrict__ output) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int64_t total_bytes = total_elements >> 1; // 2 half / byte + int64_t byte_idx = (int64_t)tid * 4; + if (byte_idx >= total_bytes) + return; + uint32_t pack4 = ((const uint32_t *)packed)[tid]; + uint8_t b0 = pack4 & 0xFF; + uint8_t b1 = (pack4 >> 8) & 0xFF; + uint8_t b2 = (pack4 >> 16) & 0xFF; + uint8_t b3 = (pack4 >> 24) & 0xFF; + int64_t half_base = byte_idx << 1; + int block_idx = half_base / blocksize; + int group_idx = block_idx / group_size; + __half scale = + __hadd(__hmul(CODE2_LUT[absmax_q[block_idx]], absmax2[group_idx]), + __float2half(offset)); + __half h[8]; + h[0] = __hmul(NF4_LUT_HALF[(b0) >> 4], scale); + h[1] = __hmul(NF4_LUT_HALF[(b0) & 0xF], scale); + h[2] = __hmul(NF4_LUT_HALF[(b1) >> 4], scale); + h[3] = __hmul(NF4_LUT_HALF[(b1) & 0xF], scale); + h[4] = __hmul(NF4_LUT_HALF[(b2) >> 4], scale); + h[5] = __hmul(NF4_LUT_HALF[(b2) & 0xF], scale); + h[6] = __hmul(NF4_LUT_HALF[(b3) >> 4], scale); + h[7] = __hmul(NF4_LUT_HALF[(b3) & 0xF], scale); + + uint4 out_pack; + reinterpret_cast<__half *>(&out_pack)[0] = h[0]; + reinterpret_cast<__half *>(&out_pack)[1] = h[1]; + reinterpret_cast<__half *>(&out_pack)[2] = h[2]; + reinterpret_cast<__half *>(&out_pack)[3] = h[3]; + reinterpret_cast<__half *>(&out_pack)[4] = h[4]; + reinterpret_cast<__half *>(&out_pack)[5] = h[5]; + reinterpret_cast<__half *>(&out_pack)[6] = h[6]; + reinterpret_cast<__half *>(&out_pack)[7] = h[7]; + + ((uint4 *)(output + half_base))[0] = out_pack; +} +// 将NF4_LUT由常量内存放到共享内存中来 +__global__ void nf4_dequant_v7(const uint8_t *__restrict__ packed, + const uint8_t *__restrict__ absmax_q, + const __half *__restrict__ absmax2, float offset, + int64_t total_elements, int blocksize, + int group_size, __half *__restrict__ output) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int64_t total_bytes = total_elements >> 1; // 2 half / byte + int64_t byte_idx = (int64_t)tid * 4; + __shared__ __half s_nf4[16]; + + if(threadIdx.x<16){ + s_nf4[threadIdx.x]=NF4_LUT_HALF[threadIdx.x]; + } + if (byte_idx >= total_bytes) + return; + uint32_t pack4 = ((const uint32_t *)packed)[tid]; + uint8_t b0 = pack4 & 0xFF; + uint8_t b1 = (pack4 >> 8) & 0xFF; + uint8_t b2 = (pack4 >> 16) & 0xFF; + uint8_t b3 = (pack4 >> 24) & 0xFF; + int64_t half_base = byte_idx << 1; + int block_idx = half_base / blocksize; + int group_idx = block_idx / group_size; + __half scale = + __hadd(__hmul(CODE2_LUT[absmax_q[block_idx]], absmax2[group_idx]), + __float2half(offset)); + __half h[8]; + h[0] = __hmul(s_nf4[(b0) >> 4], scale); + h[1] = __hmul(s_nf4[(b0) & 0xF], scale); + h[2] = __hmul(s_nf4[(b1) >> 4], scale); + h[3] = __hmul(s_nf4[(b1) & 0xF], scale); + h[4] = __hmul(s_nf4[(b2) >> 4], scale); + h[5] = __hmul(s_nf4[(b2) & 0xF], scale); + h[6] = __hmul(s_nf4[(b3) >> 4], scale); + h[7] = __hmul(s_nf4[(b3) & 0xF], scale); + + uint4 out_pack; + reinterpret_cast<__half *>(&out_pack)[0] = h[0]; + reinterpret_cast<__half *>(&out_pack)[1] = h[1]; + reinterpret_cast<__half *>(&out_pack)[2] = h[2]; + reinterpret_cast<__half *>(&out_pack)[3] = h[3]; + reinterpret_cast<__half *>(&out_pack)[4] = h[4]; + reinterpret_cast<__half *>(&out_pack)[5] = h[5]; + reinterpret_cast<__half *>(&out_pack)[6] = h[6]; + reinterpret_cast<__half *>(&out_pack)[7] = h[7]; + + ((uint4 *)(output + half_base))[0] = out_pack; +} +// 读取权重文件 +int read_weight_file(const char *filename, int64_t *rows, int64_t *cols, + int *blocksize, uint8_t **packed, uint8_t **absmax_q, + __half **absmax2, __half **code2, float *offset) { + + FILE *fp = fopen(filename, "rb"); + if (!fp) { + fprintf(stderr, " 无法打开文件: %s\n", filename); + return -1; + } + + // 读取 header + fread(rows, sizeof(int64_t), 1, fp); + fread(cols, sizeof(int64_t), 1, fp); + fread(blocksize, sizeof(int32_t), 1, fp); + + int64_t total_elements = (*rows) * (*cols); + int64_t num_packed = (total_elements + 1) / 2; + int64_t num_blocks = (total_elements + *blocksize - 1) / *blocksize; + int64_t num_groups = (num_blocks + 255) / 256; + + printf("\n 文件信息:\n"); + printf(" 矩阵: %ld x %ld\n", *rows, *cols); + printf(" 总元素数: %ld\n", total_elements); + printf(" blocksize: %d\n", *blocksize); + printf(" 打包数据大小: %ld bytes\n", num_packed); + printf(" 量化块数: %ld\n", num_blocks); + printf(" 分组数: %ld\n", num_groups); + + // 分配内存 + *packed = (uint8_t *)malloc(num_packed); + *absmax_q = (uint8_t *)malloc(num_blocks); + *absmax2 = (__half *)malloc(num_groups * sizeof(__half)); + *code2 = (__half *)malloc(256 * sizeof(__half)); + + if (!*packed || !*absmax_q || !*absmax2 || !*code2) { + fprintf(stderr, " 主机内存分配失败\n"); + fclose(fp); + return -1; + } + + // 读取数据 + fread(*packed, 1, num_packed, fp); + fread(*absmax_q, 1, num_blocks, fp); + fread(*absmax2, sizeof(__half), num_groups, fp); + fread(*code2, sizeof(__half), 256, fp); + fread(offset, sizeof(float), 1, fp); + + fclose(fp); + printf(" 文件读取成功\n"); + return 0; +} + +// 保存解量化后的权重(自动保存到cuda_results目录) +void save_dequantized_weight(const char *filename, __half *weight, + int64_t total_elements) { + ensure_directory_exists("../cuda_results"); + + // 构建完整路径 + char full_path[512]; + snprintf(full_path, sizeof(full_path), "../cuda_results/%s", filename); + + FILE *fp = fopen(full_path, "wb"); + if (!fp) { + fprintf(stderr, " 无法创建输出文件: %s\n", full_path); + return; + } + + fwrite(weight, sizeof(__half), total_elements, fp); + fclose(fp); + + printf(" 已保存解量化结果: %s (%.2f MB)\n", full_path, + (total_elements * sizeof(__half)) / (1024.0 * 1024.0)); +} + + +// 计时器 (毫秒) +double get_time_ms() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0; +} + + +// 计算有效内存带宽 +double calculate_bandwidth(int64_t total_elements, double time_ms) { + // 输入数据大小 + int64_t input_bytes = (total_elements + 1) / 2; // packed + input_bytes += (total_elements + 64 - 1) / 64; // absmax_q (近似) + input_bytes += + ((total_elements + 64 - 1) / 64 + 255) / 256 * sizeof(__half); // absmax2 + input_bytes += 256 * sizeof(__half); // code2 + + // 输出数据大小 + int64_t output_bytes = total_elements * sizeof(__half); + + int64_t total_bytes = input_bytes + output_bytes; + + return (total_bytes / (1024.0 * 1024.0 * 1024.0)) / (time_ms / 1000.0); +} + +// ============================================================ +// 主函数 +// ============================================================ +int main(int argc, char **argv) { + + if (argc != 2) { + printf("\n使用方法: %s <权重文件.bin>\n", argv[0]); + printf(" 权重文件格式: 由 Python 脚本生成的 .bin 文件\n"); + printf(" 示例: %s weight_data/weight_1024x1024_bs64.bin\n\n", argv[0]); + return -1; + } + + const char *input_file = argv[1]; + + // 确保输出目录存在 + ensure_directory_exists("../cuda_results"); + + // 初始化 LUT + printf("\n 初始化 NF4 LUT...\n"); + init_nf4_lut(); + + // 读取权重文件 + printf("\n 读取权重文件: %s\n", input_file); + int64_t rows, cols; + int blocksize; + uint8_t *h_packed, *h_absmax_q; + __half *h_absmax2, *h_code2; + float offset; + + if (read_weight_file(input_file, &rows, &cols, &blocksize, &h_packed, + &h_absmax_q, &h_absmax2, &h_code2, &offset) != 0) { + return -1; + } + + int64_t total_elements = rows * cols; + int64_t num_units = (total_elements + 1) / 2; // 每个 uint8 包含两个 half + + // 计算 GPU 内存大小 + int64_t num_blocks = (total_elements + blocksize - 1) / blocksize; + int64_t num_groups = (num_blocks + 255) / 256; + + printf("\n 计算参数:\n"); + printf(" total_elements: %ld\n", total_elements); + printf(" num_units: %ld\n", num_units); + printf(" num_blocks: %ld\n", num_blocks); + printf(" num_groups: %ld\n", num_groups); + + // 分配 GPU 内存 + printf("\n 分配 GPU 内存...\n"); + uint8_t *d_packed, *d_absmax_q; + __half *d_absmax2, *d_code2, *d_output; + + CUDA_CHECK(cudaMalloc(&d_packed, num_units)); + CUDA_CHECK(cudaMalloc(&d_absmax_q, num_blocks)); + CUDA_CHECK(cudaMalloc(&d_absmax2, num_groups * sizeof(__half))); + CUDA_CHECK(cudaMalloc(&d_code2, 256 * sizeof(__half))); + CUDA_CHECK(cudaMalloc(&d_output, total_elements * sizeof(__half))); + + // 拷贝数据到 GPU + printf(" 拷贝数据到 GPU...\n"); + CUDA_CHECK(cudaMemcpy(d_packed, h_packed, num_units, cudaMemcpyHostToDevice)); + CUDA_CHECK( + cudaMemcpy(d_absmax_q, h_absmax_q, num_blocks, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_absmax2, h_absmax2, num_groups * sizeof(__half), + cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_code2, h_code2, 256 * sizeof(__half), + cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyToSymbol(CODE2_LUT, h_code2, 256 * sizeof(__half))); + + // 分配主机输出内存 + __half *h_output = (__half *)malloc(total_elements * sizeof(__half)); + if (!h_output) { + fprintf(stderr, " 主机输出内存分配失败\n"); + return -1; + } + + // 配置内核启动参数 + int threads = 256; + int64_t total_bytes = total_elements >> 1; + int blocks = (total_bytes / 4 + threads - 1) / threads; + // int blocks = (total_bytes + threads - 1) / threads; + int max_scale = (threads * 2 + blocksize - 1) / blocksize; + +size_t smem = max_scale * sizeof(__half); + printf("\n 内核配置:\n"); + printf(" blocks: %d\n", blocks); + printf(" threads per block: %d\n", threads); + printf(" 总线程数: %d\n", blocks * threads); + + // 预热 (5次) + printf("\n 预热 (5次)...\n"); + for (int i = 0; i < 5; i++) { + nf4_dequant_v7<<>>(d_packed, d_absmax_q, d_absmax2, offset, + total_elements, blocksize, 256, + d_output); + // nf4_dequant_v5<<>>(d_packed, d_absmax_q, d_absmax2,d_code2, offset, + // total_elements, blocksize, 256, + // d_output); + // nf4_dequant_v1<<>>(d_packed, d_absmax_q, d_absmax2,d_code2, offset, + // total_elements, blocksize, 256, + // reinterpret_cast<__half2*>(d_output)); + + // nf4_dequant_v4<<>>(d_packed, d_absmax_q, d_absmax2, d_code2, + // offset, total_elements, blocksize, 256, + // reinterpret_cast<__half2*>(d_output)); + + } + CUDA_CHECK(cudaDeviceSynchronize()); + + // 正式测试 (100次) + printf(" 性能测试 (100次迭代)...\n"); + double start_time = get_time_ms(); + + for (int i = 0; i < 100; i++) { + nf4_dequant_v7<<>>(d_packed, d_absmax_q, d_absmax2, offset, + total_elements, blocksize, 256, + d_output); + // nf4_dequant_v5<<>>(d_packed, d_absmax_q, d_absmax2,d_code2, offset, + // total_elements, blocksize, 256, + // d_output); + // nf4_dequant_v1<<>>(d_packed, d_absmax_q, d_absmax2,d_code2, offset, + // total_elements, blocksize, 256, + // reinterpret_cast<__half2*>(d_output)); + // nf4_dequant_v4<<>>(d_packed, d_absmax_q, d_absmax2, d_code2, + // offset, total_elements, blocksize, 256, + // reinterpret_cast<__half2*>(d_output)); + } + + CUDA_CHECK(cudaDeviceSynchronize()); + double end_time = get_time_ms(); + + double total_time = end_time - start_time; + double avg_time_ms = total_time / 100.0; + + // 计算带宽 + double bandwidth = calculate_bandwidth(total_elements, avg_time_ms); + + // 拷贝结果回主机 + printf(" 拷贝结果回主机...\n"); + CUDA_CHECK(cudaMemcpy(h_output, d_output, total_elements * sizeof(__half), + cudaMemcpyDeviceToHost)); + + // 生成输出文件名 + char output_file[256]; + snprintf(output_file, sizeof(output_file), "dequant_%ldx%ld_bs%d.fp16", rows, + cols, blocksize); + + // 保存解量化结果(自动保存到cuda_results目录) + printf("\n 保存解量化结果...\n"); + save_dequantized_weight(output_file, h_output, total_elements); + + // 生成性能日志文件名 + char log_file[256]; + snprintf(log_file, sizeof(log_file), "perf_%ldx%ld_bs%d.log", rows, cols, + blocksize); + + // 输出性能结果 + printf("输入文件: %s\n", input_file); + printf("矩阵大小: %ld x %ld\n", rows, cols); + printf("总元素数: %ld\n", total_elements); + printf("数据大小: %.2f MB\n", + total_elements * sizeof(__half) / (1024.0 * 1024.0)); + printf("\n"); + printf("核函数执行时间: %.4f ms\n", avg_time_ms); + printf("有效内存带宽: %.2f GB/s\n", bandwidth); + printf("\n"); + printf("输出文件: cuda_results/%s\n", output_file); + printf("日志文件: cuda_results/%s\n", log_file); + + + // 保存性能日志(也保存到cuda_results目录) + char log_path[512]; + snprintf(log_path, sizeof(log_path), "../cuda_results/%s", log_file); + + FILE *log_fp = fopen(log_path, "w"); + if (log_fp) { + fprintf(log_fp, "input_file=%s\n", input_file); + fprintf(log_fp, "rows=%ld\n", rows); + fprintf(log_fp, "cols=%ld\n", cols); + fprintf(log_fp, "blocksize=%d\n", blocksize); + fprintf(log_fp, "total_elements=%ld\n", total_elements); + fprintf(log_fp, "kernel_time_ms=%.4f\n", avg_time_ms); + fprintf(log_fp, "bandwidth_gbps=%.2f\n", bandwidth); + fprintf(log_fp, "output_file=cuda_results/%s\n", output_file); + fclose(log_fp); + printf(" 性能日志已保存到: %s\n", log_path); + } + + // 清理 + free(h_packed); + free(h_absmax_q); + free(h_absmax2); + free(h_code2); + free(h_output); + + cudaFree(d_packed); + cudaFree(d_absmax_q); + cudaFree(d_absmax2); + cudaFree(d_code2); + cudaFree(d_output); + + printf("\n 测试完成!\n\n"); + return 0; +} \ No newline at end of file diff --git a/03_nf4_dequant/ayepei/nv/run_all.sh b/03_nf4_dequant/ayepei/nv/run_all.sh new file mode 100755 index 0000000..a02a276 --- /dev/null +++ b/03_nf4_dequant/ayepei/nv/run_all.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# run_all.sh + +# 确保 CUDA 程序已编译 +if [ ! -f "./nf4_dequant_cuda" ]; then + echo "编译 CUDA 程序..." + nvcc -O3 -arch=sm_90 --use_fast_math -lineinfo -Xptxas -O3 nf4_dequant_cuda.cu -o nf4_dequant_cuda +fi + +# 处理所有权重文件 +for f in ../weight_data/*.bin; do + ./nf4_dequant_cuda "$f" +done \ No newline at end of file diff --git "a/03_nf4_dequant/ayepei/\346\200\273\347\273\223\346\212\245\345\221\212.md" "b/03_nf4_dequant/ayepei/\346\200\273\347\273\223\346\212\245\345\221\212.md" new file mode 100644 index 0000000..63e38e5 --- /dev/null +++ "b/03_nf4_dequant/ayepei/\346\200\273\347\273\223\346\212\245\345\221\212.md" @@ -0,0 +1,358 @@ +# NF4反量化 +# 实验平台 +英伟达:GH200 +沐曦:C500 +摩尔线程:S5000 + +# 基准测试 +配置bitsandbytes环境进行基准测试,对于下列形状的权重矩阵生成,调用bitsandbytes进行量化及反量化,记录反量化的结果文件及运行的时间。与后续的实现进行平均误差和最大误差的对比。bitsandbytes测试在英伟达GH200进行。后续对比时各平台核函数先预热5次,再执行100次求平均时间。 +(256, 256) +(512, 512) +(1024, 1024) +(2048, 2048) +(4096, 4096) +(8192, 8192) +(16384, 16384) +(3421, 3146) +(6578, 1236) +(7000, 7000) +# python环境配置 +``` +conda create -n nf4 python=3.10 -y +pip install torch --index-url https://download.pytorch.org/whl/cu124 +pip install bitsandbytes numpy pandas +conda install -c nvidia cuda-nvcc cuda-runtime cuda-cudart -y +cd 03_nf4_dequant/ +python generate_and_benchmark_bnb.py #生成bitsandbytes基准进行对比 +``` +# 代码文件结构 +因为权重文件存储太大,不上传到github中 +Learning-CUDA +├── 03_nf4_dequant/ +│ ├── bnb_results/ # bitsandbytes 解量化输出结果 +│ ├── cuda_results/ # CUDA kernel 解量化输出结果 +│ ├── musa_results/ # MUSA 平台解量化结果 +│ ├── mx_results/ # MX 平台解量化结果 +│ ├── mexc/ # 摩尔线程平台实现代码 +│ ├── mx/ # 沐曦平台 kernel 实现 +│ ├── nv/ # CUDA kernel 实现 +│ ├── weight_data/ # NF4 量化后的权重测试数据 +│ ├── generate_and_benchmark_bnb.py # 生成 NF4 测试数据并运行 bitsandbytes benchmark +│ ├── bnb_benchmark_results.csv # bitsandbytes benchmark 结果 +│ ├── 总结报告.md # NF4 解量化实验总结 +│ └── README.md + +# nf4反量化理论 +``` +NF4[qi​]×(code2[absmax_q[block_idx]​]×absmax2​+offset) +``` +# 英伟达平台程序执行及性能分析 +```sh +cd nv +./run_all.sh //对于程序进行编译和对于所有权重文件运行,生成结果文件及时间 +python compare_results.py //进行性能和误差分析 +``` +# 英伟达优化思路 +## nf4_dequant_v1: +基础nf4反量化,实现两级缩放的解量化,建立NF4查找表(NF4_LUT),在计算中使用float进行计算,一个线程计算一个元素,NF4_LUT放入常量内存。每个线程一次处理两个 4-bit 索引,计算两个 BF16 值后打包成一个 32-bit uint32_t 一次性写入全局内存。 +| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff | +|-------|-------|----------|-----------|---------|-----|----------| +| 4096x4096 | 128 | 0.0905 | 0.0666 | 1.36 | 0.00002557 | 0.00390625 | +| 4096x4096 | 64 | 0.0891 | 0.0678 | 1.31 | 0.00001973 | 0.00390625 | +| 3421x3146 | 64 | 0.0925 | 0.0436 | 2.12 | 0.00001746 | 0.00390625 | +| 7000x7000 | 128 | 0.0939 | 0.1926 | 0.49 | 0.00001764 | 0.00390625 | +| 2048x2048 | 64 | 0.0910 | 0.0183 | 4.97 | 0.00003022 | 0.00390625 | +| 8192x8192 | 128 | 0.0912 | 0.3057 | 0.30 | 0.00002319 | 0.00390625 | +| 6578x1236 | 64 | 0.0895 | 0.0819 | 1.09 | 0.00003105 | 0.00390625 | +| 2048x2048 | 128 | 0.0925 | 0.0179 | 5.17 | 0.00001603 | 0.00390625 | +| 7000x7000 | 64 | 0.0909 | 0.2325 | 0.39 | 0.00002593 | 0.00390625 | +| 256x256 | 128 | 0.1064 | 0.0028 | 38.00 | 0.00002176 | 0.00195312 | +| 256x256 | 64 | 0.1144 | 0.0027 | 42.37 | 0.00003350 | 0.00390625 | +| 6578x1236 | 128 | 0.0876 | 0.0326 | 2.69 | 0.00001675 | 0.00390625 | +| 512x512 | 64 | 0.1010 | 0.0032 | 31.56 | 0.00001991 | 0.00195312 | +| 16384x16384 | 128 | 0.2973 | 1.6876 | 0.18 | 0.00002354 | 0.00390625 | +| 16384x16384 | 64 | 0.3074 | 1.7599 | 0.17 | 0.00002462 | 0.00390625 | +| 8192x8192 | 64 | 0.0918 | 0.3188 | 0.29 | 0.00001830 | 0.00390625 | +| 1024x1024 | 64 | 0.0960 | 0.0091 | 10.55 | 0.00001878 | 0.00390625 | +| 3421x3146 | 128 | 0.0904 | 0.0421 | 2.15 | 0.00001740 | 0.00390625 | +| 1024x1024 | 128 | 0.0912 | 0.0091 | 10.02 | 0.00001937 | 0.00390625 | +| 512x512 | 128 | 0.0977 | 0.0032 | 30.53 | 0.00001580 | 0.00195312 | +## nf4_dequant_v2 +半精比float计算快,将中间的变量转化为半精执行。 +| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff | +|-------|-------|----------|-----------|---------|-----|----------| +| 4096x4096 | 128 | 0.0905 | 0.0670 | 1.35 | 0.00020349 | 0.00390625 | +| 4096x4096 | 64 | 0.0891 | 0.0681 | 1.31 | 0.00018179 | 0.00390625 | +| 3421x3146 | 64 | 0.0925 | 0.0437 | 2.12 | 0.00017142 | 0.00390625 | +| 7000x7000 | 128 | 0.0939 | 0.1929 | 0.49 | 0.00025988 | 0.00390625 | +| 2048x2048 | 64 | 0.0910 | 0.0185 | 4.92 | 0.00022352 | 0.00390625 | +| 8192x8192 | 128 | 0.0912 | 0.2631 | 0.35 | 0.00022542 | 0.00390625 | +| 6578x1236 | 64 | 0.0895 | 0.0335 | 2.67 | 0.00022066 | 0.00390625 | +| 2048x2048 | 128 | 0.0925 | 0.0181 | 5.11 | 0.00029397 | 0.00390625 | +| 7000x7000 | 64 | 0.0909 | 0.1984 | 0.46 | 0.00024140 | 0.00390625 | +| 256x256 | 128 | 0.1064 | 0.0027 | 39.41 | 0.00024271 | 0.00390625 | +| 256x256 | 64 | 0.1144 | 0.0027 | 42.37 | 0.00022900 | 0.00390625 | +| 6578x1236 | 128 | 0.0876 | 0.0328 | 2.67 | 0.00026894 | 0.00390625 | +| 512x512 | 64 | 0.1010 | 0.0033 | 30.61 | 0.00018466 | 0.00390625 | +| 16384x16384 | 128 | 0.2973 | 1.0421 | 0.29 | 0.00022352 | 0.00390625 | +| 16384x16384 | 64 | 0.3074 | 1.0718 | 0.29 | 0.00019693 | 0.00390625 | +| 8192x8192 | 64 | 0.0918 | 0.2703 | 0.34 | 0.00017440 | 0.00390625 | +| 1024x1024 | 64 | 0.0960 | 0.0066 | 14.55 | 0.00028801 | 0.00390625 | +| 3421x3146 | 128 | 0.0904 | 0.0426 | 2.12 | 0.00026965 | 0.00390625 | +| 1024x1024 | 128 | 0.0912 | 0.0065 | 14.03 | 0.00024605 | 0.00390625 | +| 512x512 | 128 | 0.0977 | 0.0032 | 30.53 | 0.00029063 | 0.00390625 | +## nf4_dequant_v3 +NF4_LUT(NF4量化的查找表)每次都要从float转化为__half,在核函数启动前提前转化。 +下面对8192x8192大小的反量化进行ncu分析,下面各个核函数都是对8192×8192的分析。 +计算吞吐率达到90%,而内存吞吐只有10%左右。 +![NCU性能分析报告v3.1](img/v3_th.png) +查看warp状态,Stall MIO Throtle占主要部分 +![NCU性能分析报告v3.2](img/v3_warp.png) +查找具体是下面两个地方对于内存的读取导致的 +![NCU性能分析报告v3.3](img/v3_source_1.png) +![NCU性能分析报告v3.4](img/v3_source_2.png) +| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff | +|-------|-------|----------|-----------|---------|-----|----------| +| 4096x4096 | 128 | 0.0905 | 0.0669 | 1.35 | 0.00020349 | 0.00390625 | +| 4096x4096 | 64 | 0.0891 | 0.0679 | 1.31 | 0.00018179 | 0.00390625 | +| 3421x3146 | 64 | 0.0925 | 0.0438 | 2.11 | 0.00017142 | 0.00390625 | +| 7000x7000 | 128 | 0.0939 | 0.1932 | 0.49 | 0.00025988 | 0.00390625 | +| 2048x2048 | 64 | 0.0910 | 0.0186 | 4.89 | 0.00022352 | 0.00390625 | +| 8192x8192 | 128 | 0.0912 | 0.2636 | 0.35 | 0.00022542 | 0.00390625 | +| 6578x1236 | 64 | 0.0895 | 0.0337 | 2.66 | 0.00022066 | 0.00390625 | +| 2048x2048 | 128 | 0.0925 | 0.0180 | 5.14 | 0.00029397 | 0.00390625 | +| 7000x7000 | 64 | 0.0909 | 0.1987 | 0.46 | 0.00024140 | 0.00390625 | +| 256x256 | 128 | 0.1064 | 0.0027 | 39.41 | 0.00024271 | 0.00390625 | +| 256x256 | 64 | 0.1144 | 0.0028 | 40.86 | 0.00022900 | 0.00390625 | +| 6578x1236 | 128 | 0.0876 | 0.0326 | 2.69 | 0.00026894 | 0.00390625 | +| 512x512 | 64 | 0.1010 | 0.0035 | 28.86 | 0.00018466 | 0.00390625 | +| 16384x16384 | 128 | 0.2973 | 1.0441 | 0.28 | 0.00022352 | 0.00390625 | +| 16384x16384 | 64 | 0.3074 | 1.0733 | 0.29 | 0.00019693 | 0.00390625 | +| 8192x8192 | 64 | 0.0918 | 0.2710 | 0.34 | 0.00017440 | 0.00390625 | +| 1024x1024 | 64 | 0.0960 | 0.0066 | 14.55 | 0.00028801 | 0.00390625 | +| 3421x3146 | 128 | 0.0904 | 0.0428 | 2.11 | 0.00026965 | 0.00390625 | +| 1024x1024 | 128 | 0.0912 | 0.0065 | 14.03 | 0.00024605 | 0.00390625 | +| 512x512 | 128 | 0.0977 | 0.0034 | 28.74 | 0.00029063 | 0.00390625 | +## nf4_dequant_v4 +利用共享内存存储scale,但当几个线程在进行存储时其他的线程在等待,时间并没有缩短。 +相比于v3,内存吞吐率更低了。 +![NCU性能分析报告v4.1](img/v4_th.png) +查看共享内存状态大量bank conflicts +![NCU性能分析报告v4.2](img/v4_bank.png) +查看使用共享内存存在大量barrier +![NCU性能分析报告v4.3](img/v4_source_1.png) +| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff | +|-------|-------|----------|-----------|---------|-----|----------| +| 4096x4096 | 128 | 0.0905 | 0.0802 | 1.13 | 0.00020349 | 0.00390625 | +| 4096x4096 | 64 | 0.0891 | 0.0821 | 1.09 | 0.00018179 | 0.00390625 | +| 3421x3146 | 64 | 0.0925 | 0.0531 | 1.74 | 0.00017142 | 0.00390625 | +| 7000x7000 | 128 | 0.0939 | 0.2308 | 0.41 | 0.00025988 | 0.00390625 | +| 2048x2048 | 64 | 0.0910 | 0.0224 | 4.06 | 0.00022352 | 0.00390625 | +| 8192x8192 | 128 | 0.0912 | 0.3140 | 0.29 | 0.00022542 | 0.00390625 | +| 6578x1236 | 64 | 0.0895 | 0.0408 | 2.19 | 0.00022066 | 0.00390625 | +| 2048x2048 | 128 | 0.0925 | 0.0219 | 4.22 | 0.00029397 | 0.00390625 | +| 7000x7000 | 64 | 0.0909 | 0.2386 | 0.38 | 0.00024140 | 0.00390625 | +| 256x256 | 128 | 0.1064 | 0.0033 | 32.24 | 0.00024271 | 0.00390625 | +| 256x256 | 64 | 0.1144 | 0.0032 | 35.75 | 0.00022900 | 0.00390625 | +| 6578x1236 | 128 | 0.0876 | 0.0397 | 2.21 | 0.00026894 | 0.00390625 | +| 512x512 | 64 | 0.1010 | 0.0040 | 25.25 | 0.00018466 | 0.00390625 | +| 16384x16384 | 128 | 0.2973 | 1.2404 | 0.24 | 0.00022352 | 0.00390625 | +| 16384x16384 | 64 | 0.3074 | 1.2822 | 0.24 | 0.00019693 | 0.00390625 | +| 8192x8192 | 64 | 0.0918 | 0.3244 | 0.28 | 0.00017440 | 0.00390625 | +| 1024x1024 | 64 | 0.0960 | 0.0080 | 12.00 | 0.00028801 | 0.00390625 | +| 3421x3146 | 128 | 0.0904 | 0.0518 | 1.75 | 0.00026965 | 0.00390625 | +| 1024x1024 | 128 | 0.0912 | 0.0079 | 11.54 | 0.00024605 | 0.00390625 | +| 512x512 | 128 | 0.0977 | 0.0040 | 24.42 | 0.00029063 | 0.00390625 | +## nf4_dequant_v5 +进行向量化读取,一个程序处理多个元素 +现在的计算吞吐达到95%,但内存吞吐还是很低。 +![NCU性能分析报告v5.1](img/v5_th.png) +还是大量存在Stall MIO Throtle +![NCU性能分析报告v5.2](img/v5_warp.png) +对于NF4_LUT还是有大量的Stall MIO Throtle,还有对于计算scale也有Stall MIO Throtle +![NCU性能分析报告v5.3](img/v5_source_1.png) +![NCU性能分析报告v5.4](img/v5_source_2.png) +| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff | +|-------|-------|----------|-----------|---------|-----|----------| +| 4096x4096 | 128 | 0.0905 | 0.0847 | 1.07 | 0.00020349 | 0.00390625 | +| 4096x4096 | 64 | 0.0891 | 0.0928 | 0.96 | 0.00018179 | 0.00390625 | +| 3421x3146 | 64 | 0.0925 | 0.0452 | 2.05 | 0.00017142 | 0.00390625 | +| 7000x7000 | 128 | 0.0939 | 0.3667 | 0.26 | 0.00025988 | 0.00390625 | +| 2048x2048 | 64 | 0.0910 | 0.0197 | 4.62 | 0.00022352 | 0.00390625 | +| 8192x8192 | 128 | 0.0912 | 0.2261 | 0.40 | 0.00022542 | 0.00390625 | +| 6578x1236 | 64 | 0.0895 | 0.0401 | 2.23 | 0.00022066 | 0.00390625 | +| 2048x2048 | 128 | 0.0925 | 0.0193 | 4.79 | 0.00029397 | 0.00390625 | +| 7000x7000 | 64 | 0.0909 | 0.1721 | 0.53 | 0.00024140 | 0.00390625 | +| 256x256 | 128 | 0.1064 | 0.0060 | 17.73 | 0.00024271 | 0.00390625 | +| 256x256 | 64 | 0.1144 | 0.0062 | 18.45 | 0.00022900 | 0.00390625 | +| 6578x1236 | 128 | 0.0876 | 0.0433 | 2.02 | 0.00026894 | 0.00390625 | +| 512x512 | 64 | 0.1010 | 0.0034 | 29.71 | 0.00018466 | 0.00390625 | +| 16384x16384 | 128 | 0.2973 | 1.4397 | 0.21 | 0.00022352 | 0.00390625 | +| 16384x16384 | 64 | 0.3074 | 1.0827 | 0.28 | 0.00019693 | 0.00390625 | +| 8192x8192 | 64 | 0.0918 | 0.2343 | 0.39 | 0.00017440 | 0.00390625 | +| 1024x1024 | 64 | 0.0960 | 0.0094 | 10.21 | 0.00028801 | 0.00390625 | +| 3421x3146 | 128 | 0.0904 | 0.0445 | 2.03 | 0.00026965 | 0.00390625 | +| 1024x1024 | 128 | 0.0912 | 0.0058 | 15.72 | 0.00024605 | 0.00390625 | +| 512x512 | 128 | 0.0977 | 0.0067 | 14.58 | 0.00029063 | 0.00390625 | +## nf4_dequant_v6: +将code也放入常量内存,减少从全局内存进行读取 +现在的计算吞吐达到96%,但内存吞吐还是很低。 +![NCU性能分析报告v6.1](img/v6_th.png) +还是大量存在Stall MIO Throtle +![NCU性能分析报告v6.2](img/v6_warp.png) +对于NF4_LUT还是有大量的Stall MIO Throtle,还有对于code2的Stall MIO Throtle有所减少。 +![NCU性能分析报告v6.3](img/v6_source_1.png) +![NCU性能分析报告v6.4](img/v6_source_2.png) +| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff | +|-------|-------|----------|-----------|---------|-----|----------| +| 4096x4096 | 128 | 0.0905 | 0.0594 | 1.52 | 0.00020349 | 0.00390625 | +| 4096x4096 | 64 | 0.0891 | 0.0621 | 1.43 | 0.00018179 | 0.00390625 | +| 3421x3146 | 64 | 0.0925 | 0.0404 | 2.29 | 0.00017142 | 0.00390625 | +| 7000x7000 | 128 | 0.0939 | 0.1671 | 0.56 | 0.00025988 | 0.00390625 | +| 2048x2048 | 64 | 0.0910 | 0.0176 | 5.17 | 0.00022352 | 0.00390625 | +| 8192x8192 | 128 | 0.0912 | 0.2273 | 0.40 | 0.00022542 | 0.00390625 | +| 6578x1236 | 64 | 0.0895 | 0.0316 | 2.83 | 0.00022066 | 0.00390625 | +| 2048x2048 | 128 | 0.0925 | 0.0168 | 5.51 | 0.00029397 | 0.00390625 | +| 7000x7000 | 64 | 0.0909 | 0.1754 | 0.52 | 0.00024140 | 0.00390625 | +| 256x256 | 128 | 0.1064 | 0.0035 | 30.40 | 0.00024271 | 0.00390625 | +| 256x256 | 64 | 0.1144 | 0.0035 | 32.69 | 0.00022900 | 0.00390625 | +| 6578x1236 | 128 | 0.0876 | 0.0300 | 2.92 | 0.00026894 | 0.00390625 | +| 512x512 | 64 | 0.1010 | 0.0035 | 28.86 | 0.00018466 | 0.00390625 | +| 16384x16384 | 128 | 0.2973 | 0.8980 | 0.33 | 0.00022352 | 0.00390625 | +| 16384x16384 | 64 | 0.3074 | 0.9441 | 0.33 | 0.00019693 | 0.00390625 | +| 8192x8192 | 64 | 0.0918 | 0.2389 | 0.38 | 0.00017440 | 0.00390625 | +| 1024x1024 | 64 | 0.0960 | 0.0062 | 15.48 | 0.00028801 | 0.00390625 | +| 3421x3146 | 128 | 0.0904 | 0.0385 | 2.35 | 0.00026965 | 0.00390625 | +| 1024x1024 | 128 | 0.0912 | 0.0060 | 15.20 | 0.00024605 | 0.00390625 | +| 512x512 | 128 | 0.0977 | 0.0036 | 27.14 | 0.00029063 | 0.00390625 | +## nf4_dequant_v7: +将NF4_LUT由常量内存放到共享内存中来,速度比v6版本大大提高了 + +内存吞吐达到60%,且计算时间大大减少 +![NCU性能分析报告v7.1](img/v7_th.png) +Stall MIO Throtle大大减少 +![NCU性能分析报告v7.2](img/v7_warp.png) +引入共享内存后出现bank conflicts +![NCU性能分析报告v7.3](img/v7_bank.png) +对于NF4_LUT已经计算scale的Stall MIO Throtle消失 +![NCU性能分析报告v7.3](img/v7_source_1.png) +![NCU性能分析报告v7.4](img/v7_source_2.png) +| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff | +|-------|-------|----------|-----------|---------|-----|----------| +| 4096x4096 | 128 | 0.0905 | 0.0131 | 6.91 | 0.00020349 | 0.00390625 | +| 4096x4096 | 64 | 0.0891 | 0.0134 | 6.65 | 0.00018179 | 0.00390625 | +| 3421x3146 | 64 | 0.0925 | 0.0087 | 10.63 | 0.00017142 | 0.00390625 | +| 7000x7000 | 128 | 0.0939 | 0.0449 | 2.09 | 0.00025988 | 0.00390625 | +| 2048x2048 | 64 | 0.0910 | 0.0047 | 19.36 | 0.00022352 | 0.00390625 | +| 8192x8192 | 128 | 0.0912 | 0.0581 | 1.57 | 0.00022542 | 0.00390625 | +| 6578x1236 | 64 | 0.0895 | 0.0073 | 12.26 | 0.00022066 | 0.00390625 | +| 2048x2048 | 128 | 0.0925 | 0.0049 | 18.88 | 0.00029397 | 0.00390625 | +| 7000x7000 | 64 | 0.0909 | 0.0418 | 2.17 | 0.00024140 | 0.00390625 | +| 256x256 | 128 | 0.1064 | 0.0028 | 38.00 | 0.00024271 | 0.00390625 | +| 256x256 | 64 | 0.1144 | 0.0028 | 40.86 | 0.00022900 | 0.00390625 | +| 6578x1236 | 128 | 0.0876 | 0.0070 | 12.51 | 0.00026894 | 0.00390625 | +| 512x512 | 64 | 0.1010 | 0.0028 | 36.07 | 0.00018466 | 0.00390625 | +| 16384x16384 | 128 | 0.2973 | 0.2156 | 1.38 | 0.00022352 | 0.00390625 | +| 16384x16384 | 64 | 0.3074 | 0.2292 | 1.34 | 0.00019693 | 0.00390625 | +| 8192x8192 | 64 | 0.0918 | 0.0593 | 1.55 | 0.00017440 | 0.00390625 | +| 1024x1024 | 64 | 0.0960 | 0.0031 | 30.97 | 0.00028801 | 0.00390625 | +| 3421x3146 | 128 | 0.0904 | 0.0087 | 10.39 | 0.00026965 | 0.00390625 | +| 1024x1024 | 128 | 0.0912 | 0.0029 | 31.45 | 0.00024605 | 0.00390625 | +| 512x512 | 128 | 0.0977 | 0.0028 | 34.89 | 0.00029063 | 0.00390625 | +# 英伟达平台适配结果 +对于GH200对nf4_dequant_v7进行支持。执行结果权重文件为cuda_results目录下的.fp16为后缀的文件,log文件记录带宽及其执行时间。加速比在nv/comparison_cuda_results.md +| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff | +|-------|-------|----------|-----------|---------|-----|----------| +| 4096x4096 | 128 | 0.0905 | 0.0131 | 6.91 | 0.00020349 | 0.00390625 | +| 4096x4096 | 64 | 0.0891 | 0.0134 | 6.65 | 0.00018179 | 0.00390625 | +| 3421x3146 | 64 | 0.0925 | 0.0087 | 10.63 | 0.00017142 | 0.00390625 | +| 7000x7000 | 128 | 0.0939 | 0.0449 | 2.09 | 0.00025988 | 0.00390625 | +| 2048x2048 | 64 | 0.0910 | 0.0047 | 19.36 | 0.00022352 | 0.00390625 | +| 8192x8192 | 128 | 0.0912 | 0.0581 | 1.57 | 0.00022542 | 0.00390625 | +| 6578x1236 | 64 | 0.0895 | 0.0073 | 12.26 | 0.00022066 | 0.00390625 | +| 2048x2048 | 128 | 0.0925 | 0.0049 | 18.88 | 0.00029397 | 0.00390625 | +| 7000x7000 | 64 | 0.0909 | 0.0418 | 2.17 | 0.00024140 | 0.00390625 | +| 256x256 | 128 | 0.1064 | 0.0028 | 38.00 | 0.00024271 | 0.00390625 | +| 256x256 | 64 | 0.1144 | 0.0028 | 40.86 | 0.00022900 | 0.00390625 | +| 6578x1236 | 128 | 0.0876 | 0.0070 | 12.51 | 0.00026894 | 0.00390625 | +| 512x512 | 64 | 0.1010 | 0.0028 | 36.07 | 0.00018466 | 0.00390625 | +| 16384x16384 | 128 | 0.2973 | 0.2156 | 1.38 | 0.00022352 | 0.00390625 | +| 16384x16384 | 64 | 0.3074 | 0.2292 | 1.34 | 0.00019693 | 0.00390625 | +| 8192x8192 | 64 | 0.0918 | 0.0593 | 1.55 | 0.00017440 | 0.00390625 | +| 1024x1024 | 64 | 0.0960 | 0.0031 | 30.97 | 0.00028801 | 0.00390625 | +| 3421x3146 | 128 | 0.0904 | 0.0087 | 10.39 | 0.00026965 | 0.00390625 | +| 1024x1024 | 128 | 0.0912 | 0.0029 | 31.45 | 0.00024605 | 0.00390625 | +| 512x512 | 128 | 0.0977 | 0.0028 | 34.89 | 0.00029063 | 0.00390625 | +# 沐曦平台程序执行及性能分析 +```sh +cd mx +./run_all.sh 对于程序进行编译和对于所有权重文件运行 +python compare_results.py 进行性能和误差分析 +``` +# 沐曦适配结果 +对于沐曦S5000平台对nf4_dequant_v7进行支持,在大矩阵的表现较差。执行结果权重文件为mx_results目录下的.fp16为后缀的文件,log文件记录带宽及其执行时间。加速比在mx/comparison_mx_results.md +| Shape | Block | BnB (ms) | MX (ms) | Speedup | MAE | Max Diff | +|-------|-------|----------|---------|---------|-----|----------| +| 1024x1024 | 128 | 0.0912 | 0.0248 | 3.68 | 0.00024605 | 0.00390625 | +| 1024x1024 | 64 | 0.0960 | 0.0169 | 5.68 | 0.00028801 | 0.00390625 | +| 16384x16384 | 128 | 0.2973 | 0.9282 | 0.32 | 0.00022352 | 0.00390625 | +| 16384x16384 | 64 | 0.3074 | 1.0106 | 0.30 | 0.00019693 | 0.00390625 | +| 2048x2048 | 128 | 0.0925 | 0.0325 | 2.85 | 0.00029397 | 0.00390625 | +| 2048x2048 | 64 | 0.0910 | 0.0281 | 3.24 | 0.00022352 | 0.00390625 | +| 256x256 | 128 | 0.1064 | 0.0093 | 11.44 | 0.00024271 | 0.00390625 | +| 256x256 | 64 | 0.1144 | 0.0094 | 12.17 | 0.00022900 | 0.00390625 | +| 3421x3146 | 128 | 0.0904 | 0.0509 | 1.78 | 0.00026965 | 0.00390625 | +| 3421x3146 | 64 | 0.0925 | 0.0517 | 1.79 | 0.00017142 | 0.00390625 | +| 4096x4096 | 128 | 0.0905 | 0.0813 | 1.11 | 0.00020349 | 0.00390625 | +| 4096x4096 | 64 | 0.0891 | 0.0735 | 1.21 | 0.00018179 | 0.00390625 | +| 512x512 | 128 | 0.0977 | 0.0154 | 6.34 | 0.00029063 | 0.00390625 | +| 512x512 | 64 | 0.1010 | 0.0154 | 6.56 | 0.00018466 | 0.00390625 | +| 6578x1236 | 128 | 0.0876 | 0.0402 | 2.18 | 0.00026894 | 0.00390625 | +| 6578x1236 | 64 | 0.0895 | 0.0485 | 1.85 | 0.00022066 | 0.00390625 | +| 7000x7000 | 128 | 0.0939 | 0.1881 | 0.50 | 0.00025988 | 0.00390625 | +| 7000x7000 | 64 | 0.0909 | 0.1905 | 0.48 | 0.00024140 | 0.00390625 | +| 8192x8192 | 128 | 0.0912 | 0.2358 | 0.39 | 0.00022542 | 0.00390625 | +| 8192x8192 | 64 | 0.0918 | 0.2625 | 0.35 | 0.00017440 | 0.00390625 | + +# 摩尔线程程序执行及性能分析 +```sh +cd mexc +// 进行编译 +$MUSA_ROOT/bin/mcc -O3 -std=c++17 \ +-I$MUSA_ROOT/include \ +-L$MUSA_ROOT/lib \ +-L/usr/lib/gcc/x86_64-linux-gnu/11 \ +-lmusart -lstdc++ \ +nf4_dequant_musa.mu -o nf4_musa +// 提交任务执行权重文件 +sbatch test.sbatch +// 进行性能和误差分析 +python compare_results.py +``` +# 摩尔线程适配结果 +对于S5000对nf4_dequant_v7进行支持,在中大矩阵的表现差,且有效带宽在最大时也只有229GB/s。执行结果权重文件为mexc_results目录下的.fp16为后缀的文件,log文件记录带宽及其执行时间。加速比在mexc/comparison_musa_results.md +| Shape | Block | BnB (ms) | MUSA (ms) | Speedup | MAE | MSE | Max Diff | +|-------|-------|----------|-----------|---------|-----|-----|----------| +| 2048x2048 | 128 | 0.0925 | 0.0435 | 2.13 | 0.00029397 | 0.00000030 | 0.00390625 | +| 512x512 | 64 | 0.1010 | 0.0080 | 12.62 | 0.00018466 | 0.00000018 | 0.00390625 | +| 256x256 | 64 | 0.1144 | 0.0074 | 15.46 | 0.00022900 | 0.00000024 | 0.00390625 | +| 3421x3146 | 64 | 0.0925 | 0.1168 | 0.79 | 0.00017142 | 0.00000012 | 0.00390625 | +| 7000x7000 | 64 | 0.0909 | 0.5581 | 0.16 | 0.00024140 | 0.00000024 | 0.00390625 | +| 7000x7000 | 128 | 0.0939 | 0.5360 | 0.18 | 0.00025988 | 0.00000024 | 0.00390625 | +| 2048x2048 | 64 | 0.0910 | 0.0456 | 2.00 | 0.00022352 | 0.00000018 | 0.00390625 | +| 6578x1236 | 128 | 0.0876 | 0.0832 | 1.05 | 0.00026894 | 0.00000024 | 0.00390625 | +| 1024x1024 | 128 | 0.0912 | 0.0137 | 6.66 | 0.00024605 | 0.00000024 | 0.00390625 | +| 256x256 | 128 | 0.1064 | 0.0077 | 13.82 | 0.00024271 | 0.00000024 | 0.00390625 | +| 8192x8192 | 64 | 0.0918 | 0.7726 | 0.12 | 0.00017440 | 0.00000012 | 0.00390625 | +| 4096x4096 | 64 | 0.0891 | 0.1832 | 0.49 | 0.00018179 | 0.00000018 | 0.00390625 | +| 8192x8192 | 128 | 0.0912 | 0.7425 | 0.12 | 0.00022542 | 0.00000018 | 0.00390625 | +| 1024x1024 | 64 | 0.0960 | 0.0145 | 6.62 | 0.00028801 | 0.00000030 | 0.00390625 | +| 6578x1236 | 64 | 0.0895 | 0.0867 | 1.03 | 0.00022066 | 0.00000018 | 0.00390625 | +| 16384x16384 | 64 | 0.3074 | 3.3083 | 0.09 | 0.00019693 | 0.00000018 | 0.00390625 | +| 512x512 | 128 | 0.0977 | 0.0078 | 12.53 | 0.00029063 | 0.00000030 | 0.00390625 | +| 4096x4096 | 128 | 0.0905 | 0.1756 | 0.52 | 0.00020349 | 0.00000018 | 0.00390625 | +| 3421x3146 | 128 | 0.0904 | 0.1103 | 0.82 | 0.00026965 | 0.00000024 | 0.00390625 | +| 16384x16384 | 128 | 0.2973 | 3.1883 | 0.09 | 0.00022352 | 0.00000018 | 0.00390625 | + +# 未来可继续提升的地方 +解决v7版本存在的bank conflicts,v7版本在国产GPU上执行较差,找到原因解决。 \ No newline at end of file diff --git "a/03_nf4_dequant/ayepei/\346\200\273\347\273\223\346\212\245\345\221\212.pdf" "b/03_nf4_dequant/ayepei/\346\200\273\347\273\223\346\212\245\345\221\212.pdf" new file mode 100644 index 0000000..9fbec8f Binary files /dev/null and "b/03_nf4_dequant/ayepei/\346\200\273\347\273\223\346\212\245\345\221\212.pdf" differ