diff --git a/03_nf4_dequant/ayepei/bnb_benchmark_results.csv b/03_nf4_dequant/ayepei/bnb_benchmark_results.csv
new file mode 100644
index 0000000..27a93e2
--- /dev/null
+++ b/03_nf4_dequant/ayepei/bnb_benchmark_results.csv
@@ -0,0 +1,21 @@
+Shape,Blocksize,BnB Time (ms),Total Elements,Weight File,BnB Output File
+256x256,64,0.1144,65536,weight_data/weight_256x256_bs64.bin,bnb_results/bnb_256x256_bs64.fp16
+256x256,128,0.1064,65536,weight_data/weight_256x256_bs128.bin,bnb_results/bnb_256x256_bs128.fp16
+512x512,64,0.1010,262144,weight_data/weight_512x512_bs64.bin,bnb_results/bnb_512x512_bs64.fp16
+512x512,128,0.0977,262144,weight_data/weight_512x512_bs128.bin,bnb_results/bnb_512x512_bs128.fp16
+1024x1024,64,0.0960,1048576,weight_data/weight_1024x1024_bs64.bin,bnb_results/bnb_1024x1024_bs64.fp16
+1024x1024,128,0.0912,1048576,weight_data/weight_1024x1024_bs128.bin,bnb_results/bnb_1024x1024_bs128.fp16
+2048x2048,64,0.0910,4194304,weight_data/weight_2048x2048_bs64.bin,bnb_results/bnb_2048x2048_bs64.fp16
+2048x2048,128,0.0925,4194304,weight_data/weight_2048x2048_bs128.bin,bnb_results/bnb_2048x2048_bs128.fp16
+4096x4096,64,0.0891,16777216,weight_data/weight_4096x4096_bs64.bin,bnb_results/bnb_4096x4096_bs64.fp16
+4096x4096,128,0.0905,16777216,weight_data/weight_4096x4096_bs128.bin,bnb_results/bnb_4096x4096_bs128.fp16
+8192x8192,64,0.0918,67108864,weight_data/weight_8192x8192_bs64.bin,bnb_results/bnb_8192x8192_bs64.fp16
+8192x8192,128,0.0912,67108864,weight_data/weight_8192x8192_bs128.bin,bnb_results/bnb_8192x8192_bs128.fp16
+16384x16384,64,0.3074,268435456,weight_data/weight_16384x16384_bs64.bin,bnb_results/bnb_16384x16384_bs64.fp16
+16384x16384,128,0.2973,268435456,weight_data/weight_16384x16384_bs128.bin,bnb_results/bnb_16384x16384_bs128.fp16
+3421x3146,64,0.0925,10762466,weight_data/weight_3421x3146_bs64.bin,bnb_results/bnb_3421x3146_bs64.fp16
+3421x3146,128,0.0904,10762466,weight_data/weight_3421x3146_bs128.bin,bnb_results/bnb_3421x3146_bs128.fp16
+6578x1236,64,0.0895,8130408,weight_data/weight_6578x1236_bs64.bin,bnb_results/bnb_6578x1236_bs64.fp16
+6578x1236,128,0.0876,8130408,weight_data/weight_6578x1236_bs128.bin,bnb_results/bnb_6578x1236_bs128.fp16
+7000x7000,64,0.0909,49000000,weight_data/weight_7000x7000_bs64.bin,bnb_results/bnb_7000x7000_bs64.fp16
+7000x7000,128,0.0939,49000000,weight_data/weight_7000x7000_bs128.bin,bnb_results/bnb_7000x7000_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_1024x1024_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_1024x1024_bs128.log
new file mode 100644
index 0000000..02821c0
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_1024x1024_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_1024x1024_bs128.bin
+rows=1024
+cols=1024
+blocksize=128
+total_elements=1048576
+kernel_time_ms=0.0030
+bandwidth_gbps=816.30
+output_file=cuda_results/dequant_1024x1024_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_1024x1024_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_1024x1024_bs64.log
new file mode 100644
index 0000000..d849144
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_1024x1024_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_1024x1024_bs64.bin
+rows=1024
+cols=1024
+blocksize=64
+total_elements=1048576
+kernel_time_ms=0.0029
+bandwidth_gbps=845.08
+output_file=cuda_results/dequant_1024x1024_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_16384x16384_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_16384x16384_bs128.log
new file mode 100644
index 0000000..be9a2b9
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_16384x16384_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_16384x16384_bs128.bin
+rows=16384
+cols=16384
+blocksize=128
+total_elements=268435456
+kernel_time_ms=0.2102
+bandwidth_gbps=2991.95
+output_file=cuda_results/dequant_16384x16384_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_16384x16384_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_16384x16384_bs64.log
new file mode 100644
index 0000000..ecf536e
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_16384x16384_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_16384x16384_bs64.bin
+rows=16384
+cols=16384
+blocksize=64
+total_elements=268435456
+kernel_time_ms=0.2105
+bandwidth_gbps=2987.26
+output_file=cuda_results/dequant_16384x16384_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_2048x2048_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_2048x2048_bs128.log
new file mode 100644
index 0000000..d5977b1
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_2048x2048_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_2048x2048_bs128.bin
+rows=2048
+cols=2048
+blocksize=128
+total_elements=4194304
+kernel_time_ms=0.0049
+bandwidth_gbps=2017.74
+output_file=cuda_results/dequant_2048x2048_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_2048x2048_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_2048x2048_bs64.log
new file mode 100644
index 0000000..77d3f61
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_2048x2048_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_2048x2048_bs64.bin
+rows=2048
+cols=2048
+blocksize=64
+total_elements=4194304
+kernel_time_ms=0.0047
+bandwidth_gbps=2082.46
+output_file=cuda_results/dequant_2048x2048_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_256x256_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_256x256_bs128.log
new file mode 100644
index 0000000..7d15411
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_256x256_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_256x256_bs128.bin
+rows=256
+cols=256
+blocksize=128
+total_elements=65536
+kernel_time_ms=0.0026
+bandwidth_gbps=58.80
+output_file=cuda_results/dequant_256x256_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_256x256_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_256x256_bs64.log
new file mode 100644
index 0000000..1b9f8fc
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_256x256_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_256x256_bs64.bin
+rows=256
+cols=256
+blocksize=64
+total_elements=65536
+kernel_time_ms=0.0026
+bandwidth_gbps=58.80
+output_file=cuda_results/dequant_256x256_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_3421x3146_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_3421x3146_bs128.log
new file mode 100644
index 0000000..a4519d0
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_3421x3146_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_3421x3146_bs128.bin
+rows=3421
+cols=3146
+blocksize=128
+total_elements=10762466
+kernel_time_ms=0.0086
+bandwidth_gbps=2939.31
+output_file=cuda_results/dequant_3421x3146_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_3421x3146_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_3421x3146_bs64.log
new file mode 100644
index 0000000..63b378d
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_3421x3146_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_3421x3146_bs64.bin
+rows=3421
+cols=3146
+blocksize=64
+total_elements=10762466
+kernel_time_ms=0.0086
+bandwidth_gbps=2935.13
+output_file=cuda_results/dequant_3421x3146_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_4096x4096_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_4096x4096_bs128.log
new file mode 100644
index 0000000..7740d29
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_4096x4096_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_4096x4096_bs128.bin
+rows=4096
+cols=4096
+blocksize=128
+total_elements=16777216
+kernel_time_ms=0.0139
+bandwidth_gbps=2832.19
+output_file=cuda_results/dequant_4096x4096_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_4096x4096_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_4096x4096_bs64.log
new file mode 100644
index 0000000..2901aa9
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_4096x4096_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_4096x4096_bs64.bin
+rows=4096
+cols=4096
+blocksize=64
+total_elements=16777216
+kernel_time_ms=0.0138
+bandwidth_gbps=2840.68
+output_file=cuda_results/dequant_4096x4096_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_512x512_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_512x512_bs128.log
new file mode 100644
index 0000000..80257fa
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_512x512_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_512x512_bs128.bin
+rows=512
+cols=512
+blocksize=128
+total_elements=262144
+kernel_time_ms=0.0028
+bandwidth_gbps=221.24
+output_file=cuda_results/dequant_512x512_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_512x512_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_512x512_bs64.log
new file mode 100644
index 0000000..f92896f
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_512x512_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_512x512_bs64.bin
+rows=512
+cols=512
+blocksize=64
+total_elements=262144
+kernel_time_ms=0.0028
+bandwidth_gbps=221.82
+output_file=cuda_results/dequant_512x512_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_6578x1236_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_6578x1236_bs128.log
new file mode 100644
index 0000000..d234416
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_6578x1236_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_6578x1236_bs128.bin
+rows=6578
+cols=1236
+blocksize=128
+total_elements=8130408
+kernel_time_ms=0.0070
+bandwidth_gbps=2717.80
+output_file=cuda_results/dequant_6578x1236_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_6578x1236_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_6578x1236_bs64.log
new file mode 100644
index 0000000..9a36bd8
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_6578x1236_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_6578x1236_bs64.bin
+rows=6578
+cols=1236
+blocksize=64
+total_elements=8130408
+kernel_time_ms=0.0072
+bandwidth_gbps=2638.75
+output_file=cuda_results/dequant_6578x1236_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_7000x7000_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_7000x7000_bs128.log
new file mode 100644
index 0000000..5599e37
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_7000x7000_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_7000x7000_bs128.bin
+rows=7000
+cols=7000
+blocksize=128
+total_elements=49000000
+kernel_time_ms=0.0412
+bandwidth_gbps=2787.30
+output_file=cuda_results/dequant_7000x7000_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_7000x7000_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_7000x7000_bs64.log
new file mode 100644
index 0000000..2c36614
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_7000x7000_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_7000x7000_bs64.bin
+rows=7000
+cols=7000
+blocksize=64
+total_elements=49000000
+kernel_time_ms=0.0414
+bandwidth_gbps=2771.04
+output_file=cuda_results/dequant_7000x7000_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_8192x8192_bs128.log b/03_nf4_dequant/ayepei/cuda_results/perf_8192x8192_bs128.log
new file mode 100644
index 0000000..128224f
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_8192x8192_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_8192x8192_bs128.bin
+rows=8192
+cols=8192
+blocksize=128
+total_elements=67108864
+kernel_time_ms=0.0552
+bandwidth_gbps=2850.08
+output_file=cuda_results/dequant_8192x8192_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/cuda_results/perf_8192x8192_bs64.log b/03_nf4_dequant/ayepei/cuda_results/perf_8192x8192_bs64.log
new file mode 100644
index 0000000..cc998ca
--- /dev/null
+++ b/03_nf4_dequant/ayepei/cuda_results/perf_8192x8192_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_8192x8192_bs64.bin
+rows=8192
+cols=8192
+blocksize=64
+total_elements=67108864
+kernel_time_ms=42.5944
+bandwidth_gbps=3.69
+output_file=cuda_results/dequant_8192x8192_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/generate_and_benchmark_bnb.py b/03_nf4_dequant/ayepei/generate_and_benchmark_bnb.py
new file mode 100644
index 0000000..2ef8f59
--- /dev/null
+++ b/03_nf4_dequant/ayepei/generate_and_benchmark_bnb.py
@@ -0,0 +1,284 @@
+import torch
+import bitsandbytes.functional as F
+import numpy as np
+import struct
+import os
+import csv
+from pathlib import Path
+
+# 保存权重数据到文件
+def save_weight_data(filename, rows, cols, blocksize, 
+                     packed, absmax_q, absmax2, code2, offset):
+    """
+    保存权重数据到二进制文件
+    格式：
+    [header] 
+        num_rows: int64
+        num_cols: int64  
+        blocksize: int32
+    [data]
+        packed_weights: uint8[num_rows * num_cols / 2]
+        absmax_q: uint8[num_blocks]
+        absmax2: float16[num_groups]
+        code2: float16[256]
+        offset: float32
+    """
+    with open(filename, 'wb') as f:
+        # 写入 header
+        f.write(struct.pack('q', rows))           # num_rows (int64)
+        f.write(struct.pack('q', cols))           # num_cols (int64)
+        f.write(struct.pack('i', blocksize))      # blocksize (int32)
+        
+        # 写入 packed_weights (uint8)
+        packed_np = packed.cpu().numpy().astype(np.uint8)
+        f.write(packed_np.tobytes())
+        
+        # 写入 absmax_q (uint8)
+        absmax_q_np = absmax_q.cpu().numpy().astype(np.uint8)
+        f.write(absmax_q_np.tobytes())
+        
+        # 写入 absmax2 (float16)
+        absmax2_np = absmax2.cpu().numpy().astype(np.float16)
+        f.write(absmax2_np.tobytes())
+        
+        # 写入 code2 (float16[256]) - 确保长度为256
+        code2_np = code2.cpu().numpy().astype(np.float16)
+        if len(code2_np) < 256:
+            code2_padded = np.zeros(256, dtype=np.float16)
+            code2_padded[:len(code2_np)] = code2_np
+        else:
+            code2_padded = code2_np[:256]
+        f.write(code2_padded.tobytes())
+        
+        # 写入 offset (float32)
+        f.write(struct.pack('f', offset))
+    
+    file_size = os.path.getsize(filename)
+    print(f"   权重文件已保存: {filename} ({file_size/1024:.2f} KB)")
+
+# 保存 bitsandbytes 的解量化结果
+def save_bnb_output(filename, output_tensor, rows, cols):
+    """
+    保存 bitsandbytes 的解量化结果
+    格式：float16 二进制文件，按行主序存储
+    """
+    output_np = output_tensor.cpu().numpy().astype(np.float16)
+    output_np.tofile(filename)
+    file_size = os.path.getsize(filename)
+    print(f"   BnB 结果已保存: {filename} ({file_size/1024:.2f} KB)")
+
+# =========================================================
+# 运行 bitsandbytes 解量化并计时
+# =========================================================
+def run_bnb_dequant(packed, state, test_iters=100):
+    """
+    运行 bitsandbytes 解量化，返回结果和平均执行时间
+    """
+    print(f"   运行 bitsandbytes 解量化...")
+    
+    # 预热
+    for _ in range(10):
+        out_ref = F.dequantize_4bit(packed, state)
+    
+    torch.cuda.synchronize()
+    
+    # 计时
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+    
+    start.record()
+    for _ in range(test_iters):
+        out_ref = F.dequantize_4bit(packed, state)
+    end.record()
+    
+    torch.cuda.synchronize()
+    bnb_time = start.elapsed_time(end) / test_iters
+    
+    # 最后一次结果用于保存
+    final_output = F.dequantize_4bit(packed, state)
+    
+    print(f"   BnB 完成, 平均时间: {bnb_time:.4f} ms")
+    
+    return final_output, bnb_time
+
+# =========================================================
+# 生成并保存单个测试用例
+# =========================================================
+def generate_and_test(rows, cols, blocksize, group_size=256, 
+                      save_dir="weight_data", bnb_dir="bnb_results"):
+    """
+    生成一个测试用例，运行 bitsandbytes，保存所有需要的数据
+    """
+    total = rows * cols
+    
+    print(f"\n 处理 {rows}x{cols} 矩阵, blocksize={blocksize}...")
+    
+    # 创建保存目录
+    Path(save_dir).mkdir(exist_ok=True)
+    Path(bnb_dir).mkdir(exist_ok=True)
+    
+    # 创建权重数据（在 GPU 上）
+    weight = torch.randn(rows, cols, device="cuda", dtype=torch.float16)
+
+    # 量化
+    packed, state = F.quantize_4bit(
+        weight,
+        blocksize=blocksize,
+        quant_type="nf4",
+        compress_statistics=True
+    )
+
+    # 获取量化参数
+    absmax_q = state.absmax.contiguous()
+    absmax2 = state.state2.absmax.to(torch.float16).contiguous()
+    code2 = state.state2.code.to(torch.float16).contiguous()
+    offset = float(state.offset)
+
+    # 保存权重文件（供 CUDA 程序读取）
+    weight_file = f"{save_dir}/weight_{rows}x{cols}_bs{blocksize}.bin"
+    save_weight_data(weight_file, rows, cols, blocksize,
+                    packed, absmax_q, absmax2, code2, offset)
+    
+    # 运行 bitsandbytes 并计时
+    bnb_output, bnb_time = run_bnb_dequant(packed, state)
+    
+    # 保存 bitsandbytes 的解量化结果
+    bnb_file = f"{bnb_dir}/bnb_{rows}x{cols}_bs{blocksize}.fp16"
+    save_bnb_output(bnb_file, bnb_output, rows, cols)
+    
+
+    
+    return {
+        'shape': f"{rows}x{cols}",
+        'rows': rows,
+        'cols': cols,
+        'blocksize': blocksize,
+        'weight_file': weight_file,
+        'bnb_file': bnb_file,
+        'bnb_time_ms': bnb_time,
+        'total_elements': total
+    }
+
+
+# 生成所有测试用例并保存结果
+def generate_all():
+    """
+    生成所有测试用例，运行 bitsandbytes，保存结果
+    """
+    shapes = [
+        (256, 256),
+        (512, 512),
+        (1024, 1024),
+        (2048, 2048),
+        (4096, 4096),
+        (8192, 8192),
+        (16384, 16384),
+        (3421, 3146),
+        (6578, 1236),
+        (7000, 7000),
+    ]
+
+    blocksizes = [64, 128]
+    
+    save_dir = "weight_data"
+    bnb_dir = "bnb_results"
+    
+    print("=" * 70)
+    print("NF4 数据生成和 bitsandbytes 基准测试")
+    print("=" * 70)
+    print(f"权重文件保存目录: {save_dir}/")
+    print(f"BnB 结果保存目录: {bnb_dir}/")
+    print()
+    
+    results = []
+    
+    for rows, cols in shapes:
+        for block in blocksizes:
+            info = generate_and_test(rows, cols, block, 
+                                    save_dir=save_dir, bnb_dir=bnb_dir)
+            results.append(info)
+    
+    # 保存汇总结果到 CSV
+    csv_file = "bnb_benchmark_results.csv"
+    with open(csv_file, 'w', newline='') as f:
+        writer = csv.writer(f)
+        writer.writerow(['Shape', 'Blocksize', 'BnB Time (ms)', 
+                        'Total Elements', 'Weight File', 'BnB Output File'])
+        
+        for r in results:
+            writer.writerow([
+                r['shape'],
+                r['blocksize'],
+                f"{r['bnb_time_ms']:.4f}",
+                r['total_elements'],
+                r['weight_file'],
+                r['bnb_file']
+            ])
+    
+    print("\n" + "=" * 70)
+    print(" 生成完成!")
+    print("=" * 70)
+    print(f"\n BnB 基准测试结果已保存到: {csv_file}")
+    print("\n生成的目录结构:")
+    print(f"  {save_dir}/ - 包含以下权重文件（供 CUDA 程序使用）:")
+    for r in results:
+        print(f"    - weight_{r['shape']}_bs{r['blocksize']}.bin")
+    
+    print(f"\n  {bnb_dir}/ - 包含以下结果文件（用于对比）:")
+    for r in results:
+        print(f"    - bnb_{r['shape']}_bs{r['blocksize']}.fp16")
+        print(f"    - original_{r['shape']}_bs{r['blocksize']}.fp16")
+    
+    print("\n" + "=" * 70)
+    print("BnB 执行时间汇总:")
+    print("-" * 70)
+    print(f"{'Shape':<12} {'Block':<6} {'Time (ms)':<12}")
+    print("-" * 70)
+    for r in results:
+        print(f"{r['shape']:<12} {r['blocksize']:<6} {r['bnb_time_ms']:<12.4f}")
+    print("=" * 70)
+
+# 生成单个测试用例
+def generate_single(rows=1024, cols=1024, blocksize=64):
+    """
+    生成单个测试用例
+    """
+    save_dir = "weight_data"
+    bnb_dir = "bnb_results"
+    Path(save_dir).mkdir(exist_ok=True)
+    Path(bnb_dir).mkdir(exist_ok=True)
+    
+    info = generate_and_test(rows, cols, blocksize, 
+                            save_dir=save_dir, bnb_dir=bnb_dir)
+    
+    print("\n" + "=" * 50)
+    print("单个测试用例生成完成")
+    print("=" * 50)
+    print(f"权重文件: {info['weight_file']}")
+    print(f"BnB 结果: {info['bnb_file']}")
+    print(f"原始权重: {info['original_file']}")
+    print(f"BnB 执行时间: {info['bnb_time_ms']:.4f} ms")
+    
+    return info
+
+
+if __name__ == "__main__":
+    import argparse
+    
+    parser = argparse.ArgumentParser(description='生成 NF4 测试数据并运行 bitsandbytes')
+    parser.add_argument('--mode', type=str, default='all',
+                       choices=['all', 'single'],
+                       help='运行模式: all (所有配置) 或 single (单个配置)')
+    parser.add_argument('--rows', type=int, default=1024,
+                       help='矩阵行数 (single 模式)')
+    parser.add_argument('--cols', type=int, default=1024,
+                       help='矩阵列数 (single 模式)')
+    parser.add_argument('--blocksize', type=int, default=64,
+                       help='块大小 (single 模式)')
+    
+    args = parser.parse_args()
+    
+    if args.mode == 'all':
+        generate_all()
+    else:
+        generate_single(args.rows, args.cols, args.blocksize)
\ No newline at end of file
diff --git a/03_nf4_dequant/ayepei/img/v3_source_1.png b/03_nf4_dequant/ayepei/img/v3_source_1.png
new file mode 100644
index 0000000..29a92f7
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v3_source_1.png differ
diff --git a/03_nf4_dequant/ayepei/img/v3_source_2.png b/03_nf4_dequant/ayepei/img/v3_source_2.png
new file mode 100644
index 0000000..db56e95
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v3_source_2.png differ
diff --git a/03_nf4_dequant/ayepei/img/v3_th.png b/03_nf4_dequant/ayepei/img/v3_th.png
new file mode 100644
index 0000000..4b5039d
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v3_th.png differ
diff --git a/03_nf4_dequant/ayepei/img/v3_warp.png b/03_nf4_dequant/ayepei/img/v3_warp.png
new file mode 100644
index 0000000..527cba0
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v3_warp.png differ
diff --git a/03_nf4_dequant/ayepei/img/v4_bank.png b/03_nf4_dequant/ayepei/img/v4_bank.png
new file mode 100644
index 0000000..b1c8fea
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v4_bank.png differ
diff --git a/03_nf4_dequant/ayepei/img/v4_source_1.png b/03_nf4_dequant/ayepei/img/v4_source_1.png
new file mode 100644
index 0000000..4201351
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v4_source_1.png differ
diff --git a/03_nf4_dequant/ayepei/img/v4_th.png b/03_nf4_dequant/ayepei/img/v4_th.png
new file mode 100644
index 0000000..e22c26e
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v4_th.png differ
diff --git a/03_nf4_dequant/ayepei/img/v5_source_1.png b/03_nf4_dequant/ayepei/img/v5_source_1.png
new file mode 100644
index 0000000..7ea6123
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v5_source_1.png differ
diff --git a/03_nf4_dequant/ayepei/img/v5_source_2.png b/03_nf4_dequant/ayepei/img/v5_source_2.png
new file mode 100644
index 0000000..ed549f2
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v5_source_2.png differ
diff --git a/03_nf4_dequant/ayepei/img/v5_th.png b/03_nf4_dequant/ayepei/img/v5_th.png
new file mode 100644
index 0000000..3516548
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v5_th.png differ
diff --git a/03_nf4_dequant/ayepei/img/v5_warp.png b/03_nf4_dequant/ayepei/img/v5_warp.png
new file mode 100644
index 0000000..1677ffa
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v5_warp.png differ
diff --git a/03_nf4_dequant/ayepei/img/v6_source_1.png b/03_nf4_dequant/ayepei/img/v6_source_1.png
new file mode 100644
index 0000000..1694d0c
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v6_source_1.png differ
diff --git a/03_nf4_dequant/ayepei/img/v6_source_2.png b/03_nf4_dequant/ayepei/img/v6_source_2.png
new file mode 100644
index 0000000..07a9668
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v6_source_2.png differ
diff --git a/03_nf4_dequant/ayepei/img/v6_th.png b/03_nf4_dequant/ayepei/img/v6_th.png
new file mode 100644
index 0000000..70c3fa0
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v6_th.png differ
diff --git a/03_nf4_dequant/ayepei/img/v6_warp.png b/03_nf4_dequant/ayepei/img/v6_warp.png
new file mode 100644
index 0000000..ae761df
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v6_warp.png differ
diff --git a/03_nf4_dequant/ayepei/img/v7_bank.png b/03_nf4_dequant/ayepei/img/v7_bank.png
new file mode 100644
index 0000000..99766d6
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v7_bank.png differ
diff --git a/03_nf4_dequant/ayepei/img/v7_source_1.png b/03_nf4_dequant/ayepei/img/v7_source_1.png
new file mode 100644
index 0000000..0420b27
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v7_source_1.png differ
diff --git a/03_nf4_dequant/ayepei/img/v7_source_2.png b/03_nf4_dequant/ayepei/img/v7_source_2.png
new file mode 100644
index 0000000..a245189
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v7_source_2.png differ
diff --git a/03_nf4_dequant/ayepei/img/v7_th.png b/03_nf4_dequant/ayepei/img/v7_th.png
new file mode 100644
index 0000000..e1f4e1a
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v7_th.png differ
diff --git a/03_nf4_dequant/ayepei/img/v7_warp.png b/03_nf4_dequant/ayepei/img/v7_warp.png
new file mode 100644
index 0000000..b7942a6
Binary files /dev/null and b/03_nf4_dequant/ayepei/img/v7_warp.png differ
diff --git a/03_nf4_dequant/ayepei/mexc/compare_results.py b/03_nf4_dequant/ayepei/mexc/compare_results.py
new file mode 100644
index 0000000..91f39ed
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mexc/compare_results.py
@@ -0,0 +1,148 @@
+import torch
+import numpy as np
+import pandas as pd
+import csv
+from pathlib import Path
+
+
+def compare_all():
+    """对比所有 musa 和 BnB 的结果"""
+
+    BASE = Path("..")
+
+    MUSA_DIR = BASE / "musa_results"
+    BNB_DIR = BASE / "bnb_results"
+    BNB_CSV = BASE / "bnb_benchmark_results.csv"
+
+    results = []
+
+    # 读取 BnB benchmark CSV
+    if not BNB_CSV.exists():
+        print(f" 找不到 {BNB_CSV}")
+        return
+
+    bnb_data = {}
+    with open(BNB_CSV, 'r') as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            key = f"{row['Shape']}_bs{row['Blocksize']}"
+            bnb_data[key] = {
+                'time_ms': float(row['BnB Time (ms)']),
+                'bnb_file': row['BnB Output File']
+            }
+
+    # ==================================================
+    # 查找所有 MUSA 结果
+    # ==================================================
+    musa_files = list(MUSA_DIR.glob("dequant_*.fp16"))
+
+    print("\n" + "="*80)
+    print("对比结果汇总")
+    print("="*80)
+    print(f"{'Shape':<12} {'Block':<6} {'BnB (ms)':<12} {'MUSA (ms)':<12} "
+          f"{'Speedup':<8} {'MAE':<12} {'Max Diff':<12}")
+    print("-"*80)
+
+    for musa_file in musa_files:
+        filename = musa_file.name
+
+        parts = filename.replace('dequant_', '').replace('.fp16', '').split('_bs')
+        shape = parts[0]
+        blocksize = parts[1]
+
+        # 读取 MUSA 输出
+        musa_data = np.fromfile(musa_file, dtype=np.float16)
+        musa_tensor = torch.from_numpy(musa_data)
+
+        # 读取 BnB 输出
+        bnb_file = BNB_DIR / f"bnb_{shape}_bs{blocksize}.fp16"
+        if not bnb_file.exists():
+            print(f" 找不到 BnB 文件: {bnb_file}")
+            continue
+
+        bnb_data_np = np.fromfile(bnb_file, dtype=np.float16)
+        bnb_tensor = torch.from_numpy(bnb_data_np)
+
+
+
+        # 误差计算
+        mae = torch.mean(torch.abs(bnb_tensor - musa_tensor)).item()
+        mse = torch.mean((bnb_tensor - musa_tensor) ** 2).item()
+        max_diff = torch.max(torch.abs(bnb_tensor - musa_tensor)).item()
+
+        # 时间读取
+        key = f"{shape}_bs{blocksize}"
+        bnb_time = bnb_data[key]['time_ms']
+
+        log_file = MUSA_DIR / f"perf_{shape}_bs{blocksize}.log"
+        musa_time = 0
+
+        if log_file.exists():
+            with open(log_file, 'r') as f:
+                for line in f:
+                    if 'kernel_time_ms' in line:
+                        musa_time = float(line.strip().split('=')[1])
+
+        speedup = bnb_time / musa_time if musa_time > 0 else 0
+
+        print(f"{shape:<12} {blocksize:<6} "
+              f"{bnb_time:<12.4f} {musa_time:<12.4f} "
+              f"{speedup:<8.2f} {mae:<12.8f} {max_diff:<12.8f}")
+
+        results.append({
+            'shape': shape,
+            'blocksize': int(blocksize),
+            'bnb_time_ms': bnb_time,
+            'musa_time_ms': musa_time,
+            'speedup': speedup,
+            'mae': mae,
+            'mse': mse,
+            'max_diff': max_diff,
+        })
+
+    print("="*80)
+
+
+    # 保存 CSV
+    df = pd.DataFrame(results)
+    out_csv =  "comparison_musa_results.csv"
+    float_format = '%.8f' 
+    df.to_csv(out_csv, index=False, float_format=float_format)
+
+    print(f"\n 对比结果已保存到: {out_csv}")
+    md_file = "comparison_musa_results.md"
+    with open(md_file, 'w', encoding='utf-8') as f:
+        # 写入表格标题
+        f.write("# MUSA vs BnB 对比结果\n\n")
+        
+        # 写入表头
+        f.write("| Shape | Block | BnB (ms) | MUSA (ms) | Speedup | MAE | Max Diff |\n")
+        f.write("|-------|-------|----------|-----------|---------|-----|----------|\n")
+        
+        # 写入数据
+        for _, row in df.iterrows():
+            line = (f"| {row['shape']} | {row['blocksize']} | "
+                   f"{row['bnb_time_ms']:.4f} | {row['musa_time_ms']:.4f} | "
+                   f"{row['speedup']:.2f} | {row['mae']:.8f} | {row['max_diff']:.8f} |")
+            f.write(line + '\n')
+        
+        # 添加详细数据表
+        f.write("\n\n## 详细数据\n\n")
+        f.write("| Shape | Block | BnB (ms) | MUSA (ms) | Speedup | MAE | MSE | Max Diff |\n")
+        f.write("|-------|-------|----------|-----------|---------|-----|-----|----------|\n")
+        
+        for _, row in df.iterrows():
+            line = (f"| {row['shape']} | {row['blocksize']} | "
+                   f"{row['bnb_time_ms']:.4f} | {row['musa_time_ms']:.4f} | "
+                   f"{row['speedup']:.2f} | {row['mae']:.8f} | {row['mse']:.8f} | "
+                   f"{row['max_diff']:.8f} | ")
+            f.write(line + '\n')
+    
+    print(f" Markdown表格已保存到: {md_file}")
+
+    return df
+
+
+
+if __name__ == "__main__":
+    compare_all()
\ No newline at end of file
diff --git a/03_nf4_dequant/ayepei/mexc/comparison_musa_results.csv b/03_nf4_dequant/ayepei/mexc/comparison_musa_results.csv
new file mode 100644
index 0000000..c05f623
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mexc/comparison_musa_results.csv
@@ -0,0 +1,21 @@
+shape,blocksize,bnb_time_ms,musa_time_ms,speedup,mae,mse,max_diff
+2048x2048,128,0.09250000,0.04350000,2.12643678,0.00029397,0.00000030,0.00390625
+512x512,64,0.10100000,0.00800000,12.62500000,0.00018466,0.00000018,0.00390625
+256x256,64,0.11440000,0.00740000,15.45945946,0.00022900,0.00000024,0.00390625
+3421x3146,64,0.09250000,0.11680000,0.79195205,0.00017142,0.00000012,0.00390625
+7000x7000,64,0.09090000,0.55810000,0.16287404,0.00024140,0.00000024,0.00390625
+7000x7000,128,0.09390000,0.53600000,0.17518657,0.00025988,0.00000024,0.00390625
+2048x2048,64,0.09100000,0.04560000,1.99561404,0.00022352,0.00000018,0.00390625
+6578x1236,128,0.08760000,0.08320000,1.05288462,0.00026894,0.00000024,0.00390625
+1024x1024,128,0.09120000,0.01370000,6.65693431,0.00024605,0.00000024,0.00390625
+256x256,128,0.10640000,0.00770000,13.81818182,0.00024271,0.00000024,0.00390625
+8192x8192,64,0.09180000,0.77260000,0.11881957,0.00017440,0.00000012,0.00390625
+4096x4096,64,0.08910000,0.18320000,0.48635371,0.00018179,0.00000018,0.00390625
+8192x8192,128,0.09120000,0.74250000,0.12282828,0.00022542,0.00000018,0.00390625
+1024x1024,64,0.09600000,0.01450000,6.62068966,0.00028801,0.00000030,0.00390625
+6578x1236,64,0.08950000,0.08670000,1.03229527,0.00022066,0.00000018,0.00390625
+16384x16384,64,0.30740000,3.30830000,0.09291781,0.00019693,0.00000018,0.00390625
+512x512,128,0.09770000,0.00780000,12.52564103,0.00029063,0.00000030,0.00390625
+4096x4096,128,0.09050000,0.17560000,0.51537585,0.00020349,0.00000018,0.00390625
+3421x3146,128,0.09040000,0.11030000,0.81958296,0.00026965,0.00000024,0.00390625
+16384x16384,128,0.29730000,3.18830000,0.09324719,0.00022352,0.00000018,0.00390625
diff --git a/03_nf4_dequant/ayepei/mexc/comparison_musa_results.md b/03_nf4_dequant/ayepei/mexc/comparison_musa_results.md
new file mode 100644
index 0000000..7eef9bd
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mexc/comparison_musa_results.md
@@ -0,0 +1,50 @@
+# MUSA vs BnB 对比结果
+
+| Shape | Block | BnB (ms) | MUSA (ms) | Speedup | MAE | Max Diff |
+|-------|-------|----------|-----------|---------|-----|----------|
+| 2048x2048 | 128 | 0.0925 | 0.0435 | 2.13 | 0.00029397 | 0.00390625 |
+| 512x512 | 64 | 0.1010 | 0.0080 | 12.62 | 0.00018466 | 0.00390625 |
+| 256x256 | 64 | 0.1144 | 0.0074 | 15.46 | 0.00022900 | 0.00390625 |
+| 3421x3146 | 64 | 0.0925 | 0.1168 | 0.79 | 0.00017142 | 0.00390625 |
+| 7000x7000 | 64 | 0.0909 | 0.5581 | 0.16 | 0.00024140 | 0.00390625 |
+| 7000x7000 | 128 | 0.0939 | 0.5360 | 0.18 | 0.00025988 | 0.00390625 |
+| 2048x2048 | 64 | 0.0910 | 0.0456 | 2.00 | 0.00022352 | 0.00390625 |
+| 6578x1236 | 128 | 0.0876 | 0.0832 | 1.05 | 0.00026894 | 0.00390625 |
+| 1024x1024 | 128 | 0.0912 | 0.0137 | 6.66 | 0.00024605 | 0.00390625 |
+| 256x256 | 128 | 0.1064 | 0.0077 | 13.82 | 0.00024271 | 0.00390625 |
+| 8192x8192 | 64 | 0.0918 | 0.7726 | 0.12 | 0.00017440 | 0.00390625 |
+| 4096x4096 | 64 | 0.0891 | 0.1832 | 0.49 | 0.00018179 | 0.00390625 |
+| 8192x8192 | 128 | 0.0912 | 0.7425 | 0.12 | 0.00022542 | 0.00390625 |
+| 1024x1024 | 64 | 0.0960 | 0.0145 | 6.62 | 0.00028801 | 0.00390625 |
+| 6578x1236 | 64 | 0.0895 | 0.0867 | 1.03 | 0.00022066 | 0.00390625 |
+| 16384x16384 | 64 | 0.3074 | 3.3083 | 0.09 | 0.00019693 | 0.00390625 |
+| 512x512 | 128 | 0.0977 | 0.0078 | 12.53 | 0.00029063 | 0.00390625 |
+| 4096x4096 | 128 | 0.0905 | 0.1756 | 0.52 | 0.00020349 | 0.00390625 |
+| 3421x3146 | 128 | 0.0904 | 0.1103 | 0.82 | 0.00026965 | 0.00390625 |
+| 16384x16384 | 128 | 0.2973 | 3.1883 | 0.09 | 0.00022352 | 0.00390625 |
+
+
+## 详细数据
+
+| Shape | Block | BnB (ms) | MUSA (ms) | Speedup | MAE | MSE | Max Diff |
+|-------|-------|----------|-----------|---------|-----|-----|----------|
+| 2048x2048 | 128 | 0.0925 | 0.0435 | 2.13 | 0.00029397 | 0.00000030 | 0.00390625 | 
+| 512x512 | 64 | 0.1010 | 0.0080 | 12.62 | 0.00018466 | 0.00000018 | 0.00390625 | 
+| 256x256 | 64 | 0.1144 | 0.0074 | 15.46 | 0.00022900 | 0.00000024 | 0.00390625 | 
+| 3421x3146 | 64 | 0.0925 | 0.1168 | 0.79 | 0.00017142 | 0.00000012 | 0.00390625 | 
+| 7000x7000 | 64 | 0.0909 | 0.5581 | 0.16 | 0.00024140 | 0.00000024 | 0.00390625 | 
+| 7000x7000 | 128 | 0.0939 | 0.5360 | 0.18 | 0.00025988 | 0.00000024 | 0.00390625 | 
+| 2048x2048 | 64 | 0.0910 | 0.0456 | 2.00 | 0.00022352 | 0.00000018 | 0.00390625 | 
+| 6578x1236 | 128 | 0.0876 | 0.0832 | 1.05 | 0.00026894 | 0.00000024 | 0.00390625 | 
+| 1024x1024 | 128 | 0.0912 | 0.0137 | 6.66 | 0.00024605 | 0.00000024 | 0.00390625 | 
+| 256x256 | 128 | 0.1064 | 0.0077 | 13.82 | 0.00024271 | 0.00000024 | 0.00390625 | 
+| 8192x8192 | 64 | 0.0918 | 0.7726 | 0.12 | 0.00017440 | 0.00000012 | 0.00390625 | 
+| 4096x4096 | 64 | 0.0891 | 0.1832 | 0.49 | 0.00018179 | 0.00000018 | 0.00390625 | 
+| 8192x8192 | 128 | 0.0912 | 0.7425 | 0.12 | 0.00022542 | 0.00000018 | 0.00390625 | 
+| 1024x1024 | 64 | 0.0960 | 0.0145 | 6.62 | 0.00028801 | 0.00000030 | 0.00390625 | 
+| 6578x1236 | 64 | 0.0895 | 0.0867 | 1.03 | 0.00022066 | 0.00000018 | 0.00390625 | 
+| 16384x16384 | 64 | 0.3074 | 3.3083 | 0.09 | 0.00019693 | 0.00000018 | 0.00390625 | 
+| 512x512 | 128 | 0.0977 | 0.0078 | 12.53 | 0.00029063 | 0.00000030 | 0.00390625 | 
+| 4096x4096 | 128 | 0.0905 | 0.1756 | 0.52 | 0.00020349 | 0.00000018 | 0.00390625 | 
+| 3421x3146 | 128 | 0.0904 | 0.1103 | 0.82 | 0.00026965 | 0.00000024 | 0.00390625 | 
+| 16384x16384 | 128 | 0.2973 | 3.1883 | 0.09 | 0.00022352 | 0.00000018 | 0.00390625 | 
diff --git a/03_nf4_dequant/ayepei/mexc/nf4_dequant_musa.mu b/03_nf4_dequant/ayepei/mexc/nf4_dequant_musa.mu
new file mode 100644
index 0000000..684d960
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mexc/nf4_dequant_musa.mu
@@ -0,0 +1,372 @@
+#include <musa_fp16.h>
+#include <musa_runtime.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+
+#define MUSA_CHECK(call)                                                       \
+  do {                                                                         \
+    musaError_t err = call;                                                   \
+    if (err != musaSuccess) {                                                 \
+      fprintf(stderr, "MUSA error at %s:%d - %s\n", __FILE__, __LINE__,        \
+              musaGetErrorString(err));                                       \
+      exit(EXIT_FAILURE);                                                      \
+    }                                                                          \
+  } while (0)
+
+// 将表放入到常量内存加快速度
+__constant__ __half NF4_LUT_HALF[16];
+__constant__ __half CODE2_LUT[256];
+
+void ensure_directory_exists(const char *path) {
+  struct stat st = {0};
+  if (stat(path, &st) == -1) {
+#ifdef _WIN32
+    _mkdir(path);
+#else
+    mkdir(path, 0755);
+#endif
+  }
+}
+
+// 初始化 LUT 将 float 转换为 half
+void init_nf4_lut() {
+  float lut_f[16] = {-1.00000000f, -0.69619280f, -0.52507305f, -0.39491710f,
+                     -0.28444138f, -0.18477343f, -0.09105003f, 0.00000000f,
+                     0.07958030f,  0.16093020f,  0.24611230f,  0.33791524f,
+                     0.44070983f,  0.56261700f,  0.72295684f,  1.00000000f};
+
+  __half lut_h[16];
+  for (int i = 0; i < 16; i++) {
+    lut_h[i] = __float2half(lut_f[i]);
+  }
+  MUSA_CHECK(musaMemcpyToSymbol(NF4_LUT_HALF, lut_h, sizeof(lut_h)));
+}
+
+
+__global__ void nf4_dequant_v7(const uint8_t *__restrict__ packed,
+                               const uint8_t *__restrict__ absmax_q,
+                               const __half *__restrict__ absmax2, float offset,
+                               int64_t total_elements, int blocksize,
+                               int group_size, __half *__restrict__ output) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t total_bytes = total_elements >> 1; // 2 half / byte
+  int64_t byte_idx = (int64_t)tid * 4;
+  __shared__ __half s_nf4[16];
+
+  if(threadIdx.x<16){
+    s_nf4[threadIdx.x]=NF4_LUT_HALF[threadIdx.x];
+  }
+  if (byte_idx >= total_bytes)
+    return;
+  uint32_t pack4 = ((const uint32_t *)packed)[tid];
+  uint8_t b0 = pack4 & 0xFF;
+  uint8_t b1 = (pack4 >> 8) & 0xFF;
+  uint8_t b2 = (pack4 >> 16) & 0xFF;
+  uint8_t b3 = (pack4 >> 24) & 0xFF;
+  int64_t half_base = byte_idx << 1;
+  int block_idx = half_base / blocksize;
+  int group_idx = block_idx / group_size;
+  __half scale =
+      __hadd(__hmul(CODE2_LUT[absmax_q[block_idx]], absmax2[group_idx]),
+             __float2half(offset));
+  __half h[8];
+  h[0] = __hmul(s_nf4[(b0) >> 4], scale);
+  h[1] = __hmul(s_nf4[(b0) & 0xF], scale);
+  h[2] = __hmul(s_nf4[(b1) >> 4], scale);
+  h[3] = __hmul(s_nf4[(b1) & 0xF], scale);
+  h[4] = __hmul(s_nf4[(b2) >> 4], scale);
+  h[5] = __hmul(s_nf4[(b2) & 0xF], scale);
+  h[6] = __hmul(s_nf4[(b3) >> 4], scale);
+  h[7] = __hmul(s_nf4[(b3) & 0xF], scale);
+
+  uint4 out_pack;
+  reinterpret_cast<__half *>(&out_pack)[0] = h[0];
+  reinterpret_cast<__half *>(&out_pack)[1] = h[1];
+  reinterpret_cast<__half *>(&out_pack)[2] = h[2];
+  reinterpret_cast<__half *>(&out_pack)[3] = h[3];
+  reinterpret_cast<__half *>(&out_pack)[4] = h[4];
+  reinterpret_cast<__half *>(&out_pack)[5] = h[5];
+  reinterpret_cast<__half *>(&out_pack)[6] = h[6];
+  reinterpret_cast<__half *>(&out_pack)[7] = h[7];
+
+  ((uint4 *)(output + half_base))[0] = out_pack;
+}
+// 读取权重文件
+int read_weight_file(const char *filename, int64_t *rows, int64_t *cols,
+                     int *blocksize, uint8_t **packed, uint8_t **absmax_q,
+                     __half **absmax2, __half **code2, float *offset) {
+
+  FILE *fp = fopen(filename, "rb");
+  if (!fp) {
+    fprintf(stderr, "无法打开文件: %s\n", filename);
+    return -1;
+  }
+
+  // 读取 header
+  fread(rows, sizeof(int64_t), 1, fp);
+  fread(cols, sizeof(int64_t), 1, fp);
+  fread(blocksize, sizeof(int32_t), 1, fp);
+
+  int64_t total_elements = (*rows) * (*cols);
+  int64_t num_packed = (total_elements + 1) / 2;
+  int64_t num_blocks = (total_elements + *blocksize - 1) / *blocksize;
+  int64_t num_groups = (num_blocks + 255) / 256;
+
+  printf("\n 文件信息:\n");
+  printf("  矩阵: %ld x %ld\n", *rows, *cols);
+  printf("  总元素数: %ld\n", total_elements);
+  printf("  blocksize: %d\n", *blocksize);
+  printf("  打包数据大小: %ld bytes\n", num_packed);
+  printf("  量化块数: %ld\n", num_blocks);
+  printf("  分组数: %ld\n", num_groups);
+
+  // 分配内存
+  *packed = (uint8_t *)malloc(num_packed);
+  *absmax_q = (uint8_t *)malloc(num_blocks);
+  *absmax2 = (__half *)malloc(num_groups * sizeof(__half));
+  *code2 = (__half *)malloc(256 * sizeof(__half));
+
+  if (!*packed || !*absmax_q || !*absmax2 || !*code2) {
+    fprintf(stderr, " 主机内存分配失败\n");
+    fclose(fp);
+    return -1;
+  }
+
+  // 读取数据
+  fread(*packed, 1, num_packed, fp);
+  fread(*absmax_q, 1, num_blocks, fp);
+  fread(*absmax2, sizeof(__half), num_groups, fp);
+  fread(*code2, sizeof(__half), 256, fp);
+  fread(offset, sizeof(float), 1, fp);
+
+  fclose(fp);
+  printf(" 文件读取成功\n");
+  return 0;
+}
+
+// 保存解量化后的权重（自动保存到musa_results目录）
+void save_dequantized_weight(const char *filename, __half *weight,
+                             int64_t total_elements) {
+  ensure_directory_exists("../musa_results");
+
+  // 构建完整路径
+  char full_path[512];
+  snprintf(full_path, sizeof(full_path), "../musa_results/%s", filename);
+
+  FILE *fp = fopen(full_path, "wb");
+  if (!fp) {
+    fprintf(stderr, " 无法创建输出文件: %s\n", full_path);
+    return;
+  }
+
+  fwrite(weight, sizeof(__half), total_elements, fp);
+  fclose(fp);
+
+  printf(" 已保存解量化结果: %s (%.2f MB)\n", full_path,
+         (total_elements * sizeof(__half)) / (1024.0 * 1024.0));
+}
+
+// 计时器 (毫秒)
+double get_time_ms() {
+  struct timeval tv;
+  gettimeofday(&tv, NULL);
+  return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
+}
+
+
+// 计算有效内存带宽
+double calculate_bandwidth(int64_t total_elements, double time_ms) {
+  // 输入数据大小
+  int64_t input_bytes = (total_elements + 1) / 2; // packed
+  input_bytes += (total_elements + 64 - 1) / 64;  // absmax_q (近似)
+  input_bytes +=
+      ((total_elements + 64 - 1) / 64 + 255) / 256 * sizeof(__half); // absmax2
+  input_bytes += 256 * sizeof(__half);                               // code2
+
+  // 输出数据大小
+  int64_t output_bytes = total_elements * sizeof(__half);
+
+  int64_t total_bytes = input_bytes + output_bytes;
+
+  return (total_bytes / (1024.0 * 1024.0 * 1024.0)) / (time_ms / 1000.0);
+}
+
+
+int main(int argc, char **argv) {
+
+  if (argc != 2) {
+    printf("\n使用方法: %s <权重文件.bin>\n", argv[0]);
+    printf("  权重文件格式: 由 Python 脚本生成的 .bin 文件\n");
+    printf("  示例: %s weight_data/weight_1024x1024_bs64.bin\n\n", argv[0]);
+    return -1;
+  }
+
+  const char *input_file = argv[1];
+
+  // 确保输出目录存在
+  ensure_directory_exists("../musa_results");
+
+  // 初始化 LUT
+  init_nf4_lut();
+
+  // 读取权重文件
+  printf("\n 读取权重文件: %s\n", input_file);
+  int64_t rows, cols;
+  int blocksize;
+  uint8_t *h_packed, *h_absmax_q;
+  __half *h_absmax2, *h_code2;
+  float offset;
+
+  if (read_weight_file(input_file, &rows, &cols, &blocksize, &h_packed,
+                       &h_absmax_q, &h_absmax2, &h_code2, &offset) != 0) {
+    return -1;
+  }
+
+  int64_t total_elements = rows * cols;
+  int64_t num_units = (total_elements + 1) / 2; // 每个 uint8 包含两个 half
+
+  // 计算 GPU 内存大小
+  int64_t num_blocks = (total_elements + blocksize - 1) / blocksize;
+  int64_t num_groups = (num_blocks + 255) / 256;
+
+  printf("\n 计算参数:\n");
+  printf("  total_elements: %ld\n", total_elements);
+  printf("  num_units: %ld\n", num_units);
+  printf("  num_blocks: %ld\n", num_blocks);
+  printf("  num_groups: %ld\n", num_groups);
+
+  // 分配 GPU 内存
+  printf("\n 分配 MUSA 内存...\n");
+  uint8_t *d_packed, *d_absmax_q;
+  __half *d_absmax2, *d_code2, *d_output;
+
+  MUSA_CHECK(musaMalloc(&d_packed, num_units));
+  MUSA_CHECK(musaMalloc(&d_absmax_q, num_blocks));
+  MUSA_CHECK(musaMalloc(&d_absmax2, num_groups * sizeof(__half)));
+  MUSA_CHECK(musaMalloc(&d_code2, 256 * sizeof(__half)));
+  MUSA_CHECK(musaMalloc(&d_output, total_elements * sizeof(__half)));
+
+  // 拷贝数据到 GPU
+  printf(" 拷贝数据到 MUSA 设备...\n");
+  MUSA_CHECK(musaMemcpy(d_packed, h_packed, num_units, musaMemcpyHostToDevice));
+  MUSA_CHECK(musaMemcpy(d_absmax_q, h_absmax_q, num_blocks, musaMemcpyHostToDevice));
+  MUSA_CHECK(musaMemcpy(d_absmax2, h_absmax2, num_groups * sizeof(__half),
+                        musaMemcpyHostToDevice));
+  MUSA_CHECK(musaMemcpy(d_code2, h_code2, 256 * sizeof(__half),
+                        musaMemcpyHostToDevice));
+  MUSA_CHECK(musaMemcpyToSymbol(CODE2_LUT, h_code2, 256 * sizeof(__half)));
+
+  // 分配主机输出内存
+  __half *h_output = (__half *)malloc(total_elements * sizeof(__half));
+  if (!h_output) {
+    fprintf(stderr, " 主机输出内存分配失败\n");
+    return -1;
+  }
+
+  // 配置内核启动参数
+  int threads = 256;
+  int64_t total_bytes = total_elements >> 1;
+  int blocks = (total_bytes/4 + threads - 1) / threads;
+
+  printf("\n 内核配置:\n");
+  printf("  blocks: %d\n", blocks);
+  printf("  threads per block: %d\n", threads);
+  printf("  总线程数: %d\n", blocks * threads);
+
+  // 预热 (5次)
+  printf("\n 预热 (5次)...\n");
+  for (int i = 0; i < 5; i++) {
+    nf4_dequant_v7<<<blocks, threads>>>(d_packed, d_absmax_q, d_absmax2, offset, total_elements, blocksize,
+                                        256, d_output);
+  }
+  MUSA_CHECK(musaDeviceSynchronize());
+
+  // 正式测试 (100次)
+  printf("  性能测试 (100次迭代)...\n");
+  double start_time = get_time_ms();
+
+  for (int i = 0; i < 100; i++) {
+    nf4_dequant_v7<<<blocks, threads>>>(d_packed, d_absmax_q, d_absmax2, offset, total_elements, blocksize,
+                                        256, d_output);
+  }
+
+  MUSA_CHECK(musaDeviceSynchronize());
+  double end_time = get_time_ms();
+
+  double total_time = end_time - start_time;
+  double avg_time_ms = total_time / 100.0;
+
+  // 计算带宽
+  double bandwidth = calculate_bandwidth(total_elements, avg_time_ms);
+
+  // 拷贝结果回主机
+  printf(" 拷贝结果回主机...\n");
+  MUSA_CHECK(musaMemcpy(h_output, d_output, total_elements * sizeof(__half),
+                        musaMemcpyDeviceToHost));
+
+  // 生成输出文件名
+  char output_file[256];
+  snprintf(output_file, sizeof(output_file), "dequant_%ldx%ld_bs%d.fp16", rows,
+           cols, blocksize);
+
+  // 保存解量化结果（自动保存到musa_results目录）
+  printf("\n 保存解量化结果...\n");
+  save_dequantized_weight(output_file, h_output, total_elements);
+
+  // 生成性能日志文件名
+  char log_file[256];
+  snprintf(log_file, sizeof(log_file), "perf_%ldx%ld_bs%d.log", rows, cols,
+           blocksize);
+
+  // 输出性能结果
+  printf("输入文件: %s\n", input_file);
+  printf("矩阵大小: %ld x %ld\n", rows, cols);
+  printf("总元素数: %ld\n", total_elements);
+  printf("数据大小: %.2f MB\n",
+         total_elements * sizeof(__half) / (1024.0 * 1024.0));
+  printf("\n");
+  printf("核函数执行时间: %.4f ms\n", avg_time_ms);
+  printf("有效内存带宽: %.2f GB/s\n", bandwidth);
+  printf("\n");
+  printf("输出文件: musa_results/%s\n", output_file);
+  printf("日志文件: musa_results/%s\n", log_file);
+
+  // 保存性能日志（也保存到musa_results目录）
+  char log_path[512];
+  snprintf(log_path, sizeof(log_path), "../musa_results/%s", log_file);
+
+  FILE *log_fp = fopen(log_path, "w");
+  if (log_fp) {
+    fprintf(log_fp, "input_file=%s\n", input_file);
+    fprintf(log_fp, "rows=%ld\n", rows);
+    fprintf(log_fp, "cols=%ld\n", cols);
+    fprintf(log_fp, "blocksize=%d\n", blocksize);
+    fprintf(log_fp, "total_elements=%ld\n", total_elements);
+    fprintf(log_fp, "kernel_time_ms=%.4f\n", avg_time_ms);
+    fprintf(log_fp, "bandwidth_gbps=%.2f\n", bandwidth);
+    fprintf(log_fp, "output_file=musa_results/%s\n", output_file);
+    fclose(log_fp);
+    printf(" 性能日志已保存到: %s\n", log_path);
+  }
+
+  free(h_packed);
+  free(h_absmax_q);
+  free(h_absmax2);
+  free(h_code2);
+  free(h_output);
+
+  musaFree(d_packed);
+  musaFree(d_absmax_q);
+  musaFree(d_absmax2);
+  musaFree(d_code2);
+  musaFree(d_output);
+
+  printf("\n 测试完成!\n\n");
+  return 0;
+}
+
diff --git a/03_nf4_dequant/ayepei/mexc/nf4_musa b/03_nf4_dequant/ayepei/mexc/nf4_musa
new file mode 100644
index 0000000..8224952
Binary files /dev/null and b/03_nf4_dequant/ayepei/mexc/nf4_musa differ
diff --git a/03_nf4_dequant/ayepei/mexc/run.md b/03_nf4_dequant/ayepei/mexc/run.md
new file mode 100644
index 0000000..80df44e
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mexc/run.md
@@ -0,0 +1,6 @@
+$MUSA_ROOT/bin/mcc -O3 -std=c++17 \
+-I$MUSA_ROOT/include \
+-L$MUSA_ROOT/lib \
+-L/usr/lib/gcc/x86_64-linux-gnu/11 \
+-lmusart -lstdc++ \
+nf4_dequant_musa.mu -o nf4_musa
\ No newline at end of file
diff --git a/03_nf4_dequant/ayepei/mexc/test.sbatch b/03_nf4_dequant/ayepei/mexc/test.sbatch
new file mode 100644
index 0000000..f60a0c0
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mexc/test.sbatch
@@ -0,0 +1,19 @@
+#!/bin/bash
+#SBATCH --job-name=test_job              # 任务名
+#SBATCH --output=output_%j.log           # 标准输出文件（%j 会替换成 job ID）
+#SBATCH --error=error_%j.log             # 标准错误输出文件
+#SBATCH --partition=mt                   # 分区名（机器系统默认分区是 mt）
+#SBATCH --nodes=1                        # 需要的节点数
+#SBATCH --ntasks=1                       # 总任务数（通常 = 节点数 × 每节点任务数）
+#SBATCH --cpus-per-task=16               # 每个任务需要的 CPU 核心数
+#SBATCH --gres=gpu:mt:1                  # 请求 4 块 GPU（mt 是 Gres 类型）
+#SBATCH --mem=256G                       # 请求的内存
+#SBATCH --time=00:20:00                  # 运行时间上限
+
+
+
+
+
+for f in ../weight_data/*.bin; do
+    srun ./nf4_musa "$f"
+done
\ No newline at end of file
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_1024x1024_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_1024x1024_bs128.log
new file mode 100644
index 0000000..e8fc29b
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_1024x1024_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_1024x1024_bs128.bin
+rows=1024
+cols=1024
+blocksize=128
+total_elements=1048576
+kernel_time_ms=0.0139
+bandwidth_gbps=176.89
+output_file=musa_results/dequant_1024x1024_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_1024x1024_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_1024x1024_bs64.log
new file mode 100644
index 0000000..ceed4e9
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_1024x1024_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_1024x1024_bs64.bin
+rows=1024
+cols=1024
+blocksize=64
+total_elements=1048576
+kernel_time_ms=0.0148
+bandwidth_gbps=166.28
+output_file=musa_results/dequant_1024x1024_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_16384x16384_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_16384x16384_bs128.log
new file mode 100644
index 0000000..9aaaf21
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_16384x16384_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_16384x16384_bs128.bin
+rows=16384
+cols=16384
+blocksize=128
+total_elements=268435456
+kernel_time_ms=3.1883
+bandwidth_gbps=197.26
+output_file=musa_results/dequant_16384x16384_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_16384x16384_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_16384x16384_bs64.log
new file mode 100644
index 0000000..b64822d
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_16384x16384_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_16384x16384_bs64.bin
+rows=16384
+cols=16384
+blocksize=64
+total_elements=268435456
+kernel_time_ms=3.3084
+bandwidth_gbps=190.10
+output_file=musa_results/dequant_16384x16384_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_2048x2048_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_2048x2048_bs128.log
new file mode 100644
index 0000000..75659f4
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_2048x2048_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_2048x2048_bs128.bin
+rows=2048
+cols=2048
+blocksize=128
+total_elements=4194304
+kernel_time_ms=0.0431
+bandwidth_gbps=227.91
+output_file=musa_results/dequant_2048x2048_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_2048x2048_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_2048x2048_bs64.log
new file mode 100644
index 0000000..a861daf
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_2048x2048_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_2048x2048_bs64.bin
+rows=2048
+cols=2048
+blocksize=64
+total_elements=4194304
+kernel_time_ms=0.0453
+bandwidth_gbps=216.90
+output_file=musa_results/dequant_2048x2048_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_256x256_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_256x256_bs128.log
new file mode 100644
index 0000000..cec8720
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_256x256_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_256x256_bs128.bin
+rows=256
+cols=256
+blocksize=128
+total_elements=65536
+kernel_time_ms=0.0070
+bandwidth_gbps=22.07
+output_file=musa_results/dequant_256x256_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_256x256_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_256x256_bs64.log
new file mode 100644
index 0000000..e3d9d34
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_256x256_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_256x256_bs64.bin
+rows=256
+cols=256
+blocksize=64
+total_elements=65536
+kernel_time_ms=0.0075
+bandwidth_gbps=20.62
+output_file=musa_results/dequant_256x256_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_3421x3146_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_3421x3146_bs128.log
new file mode 100644
index 0000000..083f631
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_3421x3146_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_3421x3146_bs128.bin
+rows=3421
+cols=3146
+blocksize=128
+total_elements=10762466
+kernel_time_ms=0.1103
+bandwidth_gbps=228.70
+output_file=musa_results/dequant_3421x3146_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_3421x3146_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_3421x3146_bs64.log
new file mode 100644
index 0000000..9202688
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_3421x3146_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_3421x3146_bs64.bin
+rows=3421
+cols=3146
+blocksize=64
+total_elements=10762466
+kernel_time_ms=0.1152
+bandwidth_gbps=218.97
+output_file=musa_results/dequant_3421x3146_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_4096x4096_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_4096x4096_bs128.log
new file mode 100644
index 0000000..8b10a1b
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_4096x4096_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_4096x4096_bs128.bin
+rows=4096
+cols=4096
+blocksize=128
+total_elements=16777216
+kernel_time_ms=0.1756
+bandwidth_gbps=223.87
+output_file=musa_results/dequant_4096x4096_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_4096x4096_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_4096x4096_bs64.log
new file mode 100644
index 0000000..d67c5f7
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_4096x4096_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_4096x4096_bs64.bin
+rows=4096
+cols=4096
+blocksize=64
+total_elements=16777216
+kernel_time_ms=0.1832
+bandwidth_gbps=214.56
+output_file=musa_results/dequant_4096x4096_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_512x512_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_512x512_bs128.log
new file mode 100644
index 0000000..21ab078
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_512x512_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_512x512_bs128.bin
+rows=512
+cols=512
+blocksize=128
+total_elements=262144
+kernel_time_ms=0.0079
+bandwidth_gbps=77.61
+output_file=musa_results/dequant_512x512_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_512x512_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_512x512_bs64.log
new file mode 100644
index 0000000..ea20645
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_512x512_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_512x512_bs64.bin
+rows=512
+cols=512
+blocksize=64
+total_elements=262144
+kernel_time_ms=0.0075
+bandwidth_gbps=82.06
+output_file=musa_results/dequant_512x512_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_6578x1236_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_6578x1236_bs128.log
new file mode 100644
index 0000000..2b2ead1
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_6578x1236_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_6578x1236_bs128.bin
+rows=6578
+cols=1236
+blocksize=128
+total_elements=8130408
+kernel_time_ms=0.0831
+bandwidth_gbps=229.21
+output_file=musa_results/dequant_6578x1236_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_6578x1236_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_6578x1236_bs64.log
new file mode 100644
index 0000000..26988f1
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_6578x1236_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_6578x1236_bs64.bin
+rows=6578
+cols=1236
+blocksize=64
+total_elements=8130408
+kernel_time_ms=0.0867
+bandwidth_gbps=219.75
+output_file=musa_results/dequant_6578x1236_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_7000x7000_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_7000x7000_bs128.log
new file mode 100644
index 0000000..e99804c
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_7000x7000_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_7000x7000_bs128.bin
+rows=7000
+cols=7000
+blocksize=128
+total_elements=49000000
+kernel_time_ms=0.5361
+bandwidth_gbps=214.14
+output_file=musa_results/dequant_7000x7000_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_7000x7000_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_7000x7000_bs64.log
new file mode 100644
index 0000000..c4b39d5
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_7000x7000_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_7000x7000_bs64.bin
+rows=7000
+cols=7000
+blocksize=64
+total_elements=49000000
+kernel_time_ms=0.5581
+bandwidth_gbps=205.69
+output_file=musa_results/dequant_7000x7000_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_8192x8192_bs128.log b/03_nf4_dequant/ayepei/musa_results/perf_8192x8192_bs128.log
new file mode 100644
index 0000000..2dbf46c
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_8192x8192_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_8192x8192_bs128.bin
+rows=8192
+cols=8192
+blocksize=128
+total_elements=67108864
+kernel_time_ms=0.7425
+bandwidth_gbps=211.76
+output_file=musa_results/dequant_8192x8192_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/musa_results/perf_8192x8192_bs64.log b/03_nf4_dequant/ayepei/musa_results/perf_8192x8192_bs64.log
new file mode 100644
index 0000000..ef7e058
--- /dev/null
+++ b/03_nf4_dequant/ayepei/musa_results/perf_8192x8192_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_8192x8192_bs64.bin
+rows=8192
+cols=8192
+blocksize=64
+total_elements=67108864
+kernel_time_ms=0.7726
+bandwidth_gbps=203.52
+output_file=musa_results/dequant_8192x8192_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/mx/compare_results.py b/03_nf4_dequant/ayepei/mx/compare_results.py
new file mode 100644
index 0000000..340efb3
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx/compare_results.py
@@ -0,0 +1,131 @@
+import csv
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import torch
+
+
+def compare_all():
+    """对比所有 mx 和 BnB 的结果"""
+
+    base = Path("..")
+    mx_dir = base / "mx_results"
+    bnb_dir = base / "bnb_results"
+    bnb_csv = base / "bnb_benchmark_results.csv"
+
+    results = []
+
+    if not bnb_csv.exists():
+        print(f"找不到 {bnb_csv}")
+        return
+
+    bnb_data = {}
+    with open(bnb_csv, "r", encoding="utf-8") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            key = f"{row['Shape']}_bs{row['Blocksize']}"
+            bnb_data[key] = {
+                "time_ms": float(row["BnB Time (ms)"]),
+                "bnb_file": row["BnB Output File"],
+            }
+
+    mx_files = list(mx_dir.glob("dequant_*.fp16"))
+
+    print("\n" + "=" * 90)
+    print("对比结果汇总")
+    print("=" * 90)
+    print(
+        f"{'Shape':<12} {'Block':<6} {'BnB (ms)':<12} {'MX (ms)':<12} {'Speedup':<8} {'MAE':<12} {'Max Diff':<12}"
+    )
+    print("-" * 90)
+
+    for mx_file in mx_files:
+        filename = mx_file.name
+        shape, blocksize = (
+            filename.replace("dequant_", "").replace(".fp16", "").split("_bs")
+        )
+
+        mx_tensor = torch.from_numpy(np.fromfile(mx_file, dtype=np.float16))
+
+        bnb_file = bnb_dir / f"bnb_{shape}_bs{blocksize}.fp16"
+        if not bnb_file.exists():
+            print(f"找不到 BnB 文件: {bnb_file}")
+            continue
+
+        bnb_tensor = torch.from_numpy(np.fromfile(bnb_file, dtype=np.float16))
+
+        
+
+        mae = torch.mean(torch.abs(bnb_tensor - mx_tensor)).item()
+        mse = torch.mean((bnb_tensor - mx_tensor) ** 2).item()
+        max_diff = torch.max(torch.abs(bnb_tensor - mx_tensor)).item()
+
+        key = f"{shape}_bs{blocksize}"
+        bnb_time = bnb_data.get(key, {}).get("time_ms", 0.0)
+
+        log_file = mx_dir / f"perf_{shape}_bs{blocksize}.log"
+        mx_time = 0.0
+        if log_file.exists():
+            with open(log_file, "r", encoding="utf-8") as f:
+                for line in f:
+                    if "kernel_time_ms" in line:
+                        mx_time = float(line.strip().split("=")[1])
+
+        speedup = bnb_time / mx_time if mx_time > 0 else 0.0
+
+        print(
+            f"{shape:<12} {blocksize:<6} {bnb_time:<12.4f} {mx_time:<12.4f} {speedup:<8.2f} {mae:<12.8f} {max_diff:<12.8f}"
+        )
+
+        results.append(
+            {
+                "shape": shape,
+                "blocksize": int(blocksize),
+                "bnb_time_ms": bnb_time,
+                "mx_time_ms": mx_time,
+                "speedup": speedup,
+                "mae": mae,
+                "mse": mse,
+                "max_diff": max_diff,
+            }
+        )
+
+    print("=" * 90)
+
+    if not results:
+        print("没有可用的对比结果。")
+        return
+
+    df = pd.DataFrame(results)
+    out_csv = "comparison_mx_results.csv"
+    df.to_csv(out_csv, index=False, float_format="%.8f")
+    print(f"\n对比结果已保存到: {out_csv}")
+
+    md_file = "comparison_mx_results.md"
+    with open(md_file, "w", encoding="utf-8") as f:
+        f.write("# MX vs BnB 对比结果\n\n")
+        f.write("| Shape | Block | BnB (ms) | MX (ms) | Speedup | MAE | Max Diff |\n")
+        f.write("|-------|-------|----------|---------|---------|-----|----------|\n")
+        for _, row in df.iterrows():
+            f.write(
+                f"| {row['shape']} | {row['blocksize']} | {row['bnb_time_ms']:.4f} | {row['mx_time_ms']:.4f} | {row['speedup']:.2f} | {row['mae']:.8f} | {row['max_diff']:.8f} |\n"
+            )
+
+        f.write("\n\n## 详细数据\n\n")
+        f.write(
+            "| Shape | Block | BnB (ms) | MX (ms) | Speedup | MAE | MSE | Max Diff |\n"
+        )
+        f.write(
+            "|-------|-------|----------|---------|---------|-----|-----|----------|\n"
+        )
+        for _, row in df.iterrows():
+            f.write(
+                f"| {row['shape']} | {row['blocksize']} | {row['bnb_time_ms']:.4f} | {row['mx_time_ms']:.4f} | {row['speedup']:.2f} | {row['mae']:.8f} | {row['mse']:.8f} | {row['max_diff']:.8f}  |\n"
+            )
+
+    print(f"Markdown 表格已保存到: {md_file}")
+
+
+if __name__ == "__main__":
+    compare_all()
diff --git a/03_nf4_dequant/ayepei/mx/comparison_mx_results.csv b/03_nf4_dequant/ayepei/mx/comparison_mx_results.csv
new file mode 100644
index 0000000..74301be
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx/comparison_mx_results.csv
@@ -0,0 +1,21 @@
+shape,blocksize,bnb_time_ms,mx_time_ms,speedup,mae,mse,max_diff
+1024x1024,128,0.09120000,0.02480000,3.67741935,0.00024605,0.00000024,0.00390625
+1024x1024,64,0.09600000,0.01690000,5.68047337,0.00028801,0.00000030,0.00390625
+16384x16384,128,0.29730000,0.92820000,0.32029735,0.00022352,0.00000018,0.00390625
+16384x16384,64,0.30740000,1.01060000,0.30417574,0.00019693,0.00000018,0.00390625
+2048x2048,128,0.09250000,0.03250000,2.84615385,0.00029397,0.00000030,0.00390625
+2048x2048,64,0.09100000,0.02810000,3.23843416,0.00022352,0.00000018,0.00390625
+256x256,128,0.10640000,0.00930000,11.44086022,0.00024271,0.00000024,0.00390625
+256x256,64,0.11440000,0.00940000,12.17021277,0.00022900,0.00000024,0.00390625
+3421x3146,128,0.09040000,0.05090000,1.77603143,0.00026965,0.00000024,0.00390625
+3421x3146,64,0.09250000,0.05170000,1.78916828,0.00017142,0.00000012,0.00390625
+4096x4096,128,0.09050000,0.08130000,1.11316113,0.00020349,0.00000018,0.00390625
+4096x4096,64,0.08910000,0.07350000,1.21224490,0.00018179,0.00000018,0.00390625
+512x512,128,0.09770000,0.01540000,6.34415584,0.00029063,0.00000030,0.00390625
+512x512,64,0.10100000,0.01540000,6.55844156,0.00018466,0.00000018,0.00390625
+6578x1236,128,0.08760000,0.04020000,2.17910448,0.00026894,0.00000024,0.00390625
+6578x1236,64,0.08950000,0.04850000,1.84536082,0.00022066,0.00000018,0.00390625
+7000x7000,128,0.09390000,0.18810000,0.49920255,0.00025988,0.00000024,0.00390625
+7000x7000,64,0.09090000,0.19050000,0.47716535,0.00024140,0.00000024,0.00390625
+8192x8192,128,0.09120000,0.23580000,0.38676845,0.00022542,0.00000018,0.00390625
+8192x8192,64,0.09180000,0.26250000,0.34971429,0.00017440,0.00000012,0.00390625
diff --git a/03_nf4_dequant/ayepei/mx/comparison_mx_results.md b/03_nf4_dequant/ayepei/mx/comparison_mx_results.md
new file mode 100644
index 0000000..6e8680a
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx/comparison_mx_results.md
@@ -0,0 +1,50 @@
+# MX vs BnB 对比结果
+
+| Shape | Block | BnB (ms) | MX (ms) | Speedup | MAE | Max Diff |
+|-------|-------|----------|---------|---------|-----|----------|
+| 1024x1024 | 128 | 0.0912 | 0.0248 | 3.68 | 0.00024605 | 0.00390625 |
+| 1024x1024 | 64 | 0.0960 | 0.0169 | 5.68 | 0.00028801 | 0.00390625 |
+| 16384x16384 | 128 | 0.2973 | 0.9282 | 0.32 | 0.00022352 | 0.00390625 |
+| 16384x16384 | 64 | 0.3074 | 1.0106 | 0.30 | 0.00019693 | 0.00390625 |
+| 2048x2048 | 128 | 0.0925 | 0.0325 | 2.85 | 0.00029397 | 0.00390625 |
+| 2048x2048 | 64 | 0.0910 | 0.0281 | 3.24 | 0.00022352 | 0.00390625 |
+| 256x256 | 128 | 0.1064 | 0.0093 | 11.44 | 0.00024271 | 0.00390625 |
+| 256x256 | 64 | 0.1144 | 0.0094 | 12.17 | 0.00022900 | 0.00390625 |
+| 3421x3146 | 128 | 0.0904 | 0.0509 | 1.78 | 0.00026965 | 0.00390625 |
+| 3421x3146 | 64 | 0.0925 | 0.0517 | 1.79 | 0.00017142 | 0.00390625 |
+| 4096x4096 | 128 | 0.0905 | 0.0813 | 1.11 | 0.00020349 | 0.00390625 |
+| 4096x4096 | 64 | 0.0891 | 0.0735 | 1.21 | 0.00018179 | 0.00390625 |
+| 512x512 | 128 | 0.0977 | 0.0154 | 6.34 | 0.00029063 | 0.00390625 |
+| 512x512 | 64 | 0.1010 | 0.0154 | 6.56 | 0.00018466 | 0.00390625 |
+| 6578x1236 | 128 | 0.0876 | 0.0402 | 2.18 | 0.00026894 | 0.00390625 |
+| 6578x1236 | 64 | 0.0895 | 0.0485 | 1.85 | 0.00022066 | 0.00390625 |
+| 7000x7000 | 128 | 0.0939 | 0.1881 | 0.50 | 0.00025988 | 0.00390625 |
+| 7000x7000 | 64 | 0.0909 | 0.1905 | 0.48 | 0.00024140 | 0.00390625 |
+| 8192x8192 | 128 | 0.0912 | 0.2358 | 0.39 | 0.00022542 | 0.00390625 |
+| 8192x8192 | 64 | 0.0918 | 0.2625 | 0.35 | 0.00017440 | 0.00390625 |
+
+
+## 详细数据
+
+| Shape | Block | BnB (ms) | MX (ms) | Speedup | MAE | MSE | Max Diff |
+|-------|-------|----------|---------|---------|-----|-----|----------|
+| 1024x1024 | 128 | 0.0912 | 0.0248 | 3.68 | 0.00024605 | 0.00000024 | 0.00390625  |
+| 1024x1024 | 64 | 0.0960 | 0.0169 | 5.68 | 0.00028801 | 0.00000030 | 0.00390625  |
+| 16384x16384 | 128 | 0.2973 | 0.9282 | 0.32 | 0.00022352 | 0.00000018 | 0.00390625  |
+| 16384x16384 | 64 | 0.3074 | 1.0106 | 0.30 | 0.00019693 | 0.00000018 | 0.00390625  |
+| 2048x2048 | 128 | 0.0925 | 0.0325 | 2.85 | 0.00029397 | 0.00000030 | 0.00390625  |
+| 2048x2048 | 64 | 0.0910 | 0.0281 | 3.24 | 0.00022352 | 0.00000018 | 0.00390625  |
+| 256x256 | 128 | 0.1064 | 0.0093 | 11.44 | 0.00024271 | 0.00000024 | 0.00390625  |
+| 256x256 | 64 | 0.1144 | 0.0094 | 12.17 | 0.00022900 | 0.00000024 | 0.00390625  |
+| 3421x3146 | 128 | 0.0904 | 0.0509 | 1.78 | 0.00026965 | 0.00000024 | 0.00390625  |
+| 3421x3146 | 64 | 0.0925 | 0.0517 | 1.79 | 0.00017142 | 0.00000012 | 0.00390625  |
+| 4096x4096 | 128 | 0.0905 | 0.0813 | 1.11 | 0.00020349 | 0.00000018 | 0.00390625  |
+| 4096x4096 | 64 | 0.0891 | 0.0735 | 1.21 | 0.00018179 | 0.00000018 | 0.00390625  |
+| 512x512 | 128 | 0.0977 | 0.0154 | 6.34 | 0.00029063 | 0.00000030 | 0.00390625  |
+| 512x512 | 64 | 0.1010 | 0.0154 | 6.56 | 0.00018466 | 0.00000018 | 0.00390625  |
+| 6578x1236 | 128 | 0.0876 | 0.0402 | 2.18 | 0.00026894 | 0.00000024 | 0.00390625  |
+| 6578x1236 | 64 | 0.0895 | 0.0485 | 1.85 | 0.00022066 | 0.00000018 | 0.00390625  |
+| 7000x7000 | 128 | 0.0939 | 0.1881 | 0.50 | 0.00025988 | 0.00000024 | 0.00390625  |
+| 7000x7000 | 64 | 0.0909 | 0.1905 | 0.48 | 0.00024140 | 0.00000024 | 0.00390625  |
+| 8192x8192 | 128 | 0.0912 | 0.2358 | 0.39 | 0.00022542 | 0.00000018 | 0.00390625  |
+| 8192x8192 | 64 | 0.0918 | 0.2625 | 0.35 | 0.00017440 | 0.00000012 | 0.00390625  |
diff --git a/03_nf4_dequant/ayepei/mx/nf4_dequant_mx b/03_nf4_dequant/ayepei/mx/nf4_dequant_mx
new file mode 100755
index 0000000..0522008
Binary files /dev/null and b/03_nf4_dequant/ayepei/mx/nf4_dequant_mx differ
diff --git a/03_nf4_dequant/ayepei/mx/nf4_dequant_mx.maca b/03_nf4_dequant/ayepei/mx/nf4_dequant_mx.maca
new file mode 100644
index 0000000..bdacfa8
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx/nf4_dequant_mx.maca
@@ -0,0 +1,366 @@
+#include <maca_fp16.h>
+#include <mc_runtime.h>
+
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+
+#define GPU_CHECK(call)                                                        \
+  do {                                                                         \
+    mcError_t err = call;                                                    \
+    if (err != mcSuccess) {                                                  \
+      fprintf(stderr, "GPU error at %s:%d - %s\n", __FILE__, __LINE__,       \
+              mcGetErrorString(err));                                        \
+      exit(EXIT_FAILURE);                                                      \
+    }                                                                          \
+  } while (0)
+
+__constant__ __half NF4_LUT_HALF[16];
+__constant__ __half CODE2_LUT[256];
+
+bool read_exact(FILE *fp, void *dst, size_t elem_size, size_t count) {
+  return fread(dst, elem_size, count, fp) == count;
+}
+
+void ensure_directory_exists(const char *path) {
+  struct stat st = {0};
+  if (stat(path, &st) == -1) {
+#ifdef _WIN32
+    mkdir(path);
+#else
+    mkdir(path, 0755);
+#endif
+  }
+}
+
+void init_nf4_lut() {
+  float lut_f[16] = {-1.00000000f, -0.69619280f, -0.52507305f, -0.39491710f,
+                     -0.28444138f, -0.18477343f, -0.09105003f, 0.00000000f,
+                     0.07958030f,  0.16093020f,  0.24611230f,  0.33791524f,
+                     0.44070983f,  0.56261700f,  0.72295684f,  1.00000000f};
+
+  __half lut_h[16];
+  for (int i = 0; i < 16; ++i) {
+    lut_h[i] = __float2half(lut_f[i]);
+  }
+  GPU_CHECK(mcMemcpyToSymbol(NF4_LUT_HALF, lut_h, sizeof(lut_h)));
+}
+
+__global__ void nf4_dequant_v6(const uint8_t *__restrict__ packed,
+                               const uint8_t *__restrict__ absmax_q,
+                               const __half *__restrict__ absmax2, float offset,
+                               int64_t total_elements, int blocksize,
+                               int group_size, __half *__restrict__ output) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t total_bytes = total_elements >> 1; // 2 half / byte
+  int64_t byte_idx = (int64_t)tid * 4;
+  if (byte_idx >= total_bytes)
+    return;
+  uint32_t pack4 = ((const uint32_t *)packed)[tid];
+  uint8_t b0 = pack4 & 0xFF;
+  uint8_t b1 = (pack4 >> 8) & 0xFF;
+  uint8_t b2 = (pack4 >> 16) & 0xFF;
+  uint8_t b3 = (pack4 >> 24) & 0xFF;
+  int64_t half_base = byte_idx << 1;
+  int block_idx = half_base / blocksize;
+  int group_idx = block_idx / group_size;
+  __half scale =
+      __hadd(__hmul(CODE2_LUT[absmax_q[block_idx]], absmax2[group_idx]),
+             __float2half(offset));
+  __half h[8];
+  h[0] = __hmul(NF4_LUT_HALF[(b0) >> 4], scale);
+  h[1] = __hmul(NF4_LUT_HALF[(b0) & 0xF], scale);
+  h[2] = __hmul(NF4_LUT_HALF[(b1) >> 4], scale);
+  h[3] = __hmul(NF4_LUT_HALF[(b1) & 0xF], scale);
+  h[4] = __hmul(NF4_LUT_HALF[(b2) >> 4], scale);
+  h[5] = __hmul(NF4_LUT_HALF[(b2) & 0xF], scale);
+  h[6] = __hmul(NF4_LUT_HALF[(b3) >> 4], scale);
+  h[7] = __hmul(NF4_LUT_HALF[(b3) & 0xF], scale);
+
+  uint4 out_pack;
+  reinterpret_cast<__half *>(&out_pack)[0] = h[0];
+  reinterpret_cast<__half *>(&out_pack)[1] = h[1];
+  reinterpret_cast<__half *>(&out_pack)[2] = h[2];
+  reinterpret_cast<__half *>(&out_pack)[3] = h[3];
+  reinterpret_cast<__half *>(&out_pack)[4] = h[4];
+  reinterpret_cast<__half *>(&out_pack)[5] = h[5];
+  reinterpret_cast<__half *>(&out_pack)[6] = h[6];
+  reinterpret_cast<__half *>(&out_pack)[7] = h[7];
+
+  ((uint4 *)(output + half_base))[0] = out_pack;
+}
+__global__ void nf4_dequant_v7(const uint8_t *__restrict__ packed,
+                               const uint8_t *__restrict__ absmax_q,
+                               const __half *__restrict__ absmax2, float offset,
+                               int64_t total_elements, int blocksize,
+                               int group_size, __half *__restrict__ output) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t total_bytes = total_elements >> 1; // 2 half / byte
+  int64_t byte_idx = (int64_t)tid * 4;
+  __shared__ __half s_nf4[16];
+
+  if(threadIdx.x<16){
+    s_nf4[threadIdx.x]=NF4_LUT_HALF[threadIdx.x];
+  }
+  if (byte_idx >= total_bytes)
+    return;
+  uint32_t pack4 = ((const uint32_t *)packed)[tid];
+  uint8_t b0 = pack4 & 0xFF;
+  uint8_t b1 = (pack4 >> 8) & 0xFF;
+  uint8_t b2 = (pack4 >> 16) & 0xFF;
+  uint8_t b3 = (pack4 >> 24) & 0xFF;
+  int64_t half_base = byte_idx << 1;
+  int block_idx = half_base / blocksize;
+  int group_idx = block_idx / group_size;
+  __half scale =
+      __hadd(__hmul(CODE2_LUT[absmax_q[block_idx]], absmax2[group_idx]),
+             __float2half(offset));
+  __half h[8];
+  h[0] = __hmul(s_nf4[(b0) >> 4], scale);
+  h[1] = __hmul(s_nf4[(b0) & 0xF], scale);
+  h[2] = __hmul(s_nf4[(b1) >> 4], scale);
+  h[3] = __hmul(s_nf4[(b1) & 0xF], scale);
+  h[4] = __hmul(s_nf4[(b2) >> 4], scale);
+  h[5] = __hmul(s_nf4[(b2) & 0xF], scale);
+  h[6] = __hmul(s_nf4[(b3) >> 4], scale);
+  h[7] = __hmul(s_nf4[(b3) & 0xF], scale);
+
+  uint4 out_pack;
+  reinterpret_cast<__half *>(&out_pack)[0] = h[0];
+  reinterpret_cast<__half *>(&out_pack)[1] = h[1];
+  reinterpret_cast<__half *>(&out_pack)[2] = h[2];
+  reinterpret_cast<__half *>(&out_pack)[3] = h[3];
+  reinterpret_cast<__half *>(&out_pack)[4] = h[4];
+  reinterpret_cast<__half *>(&out_pack)[5] = h[5];
+  reinterpret_cast<__half *>(&out_pack)[6] = h[6];
+  reinterpret_cast<__half *>(&out_pack)[7] = h[7];
+
+  ((uint4 *)(output + half_base))[0] = out_pack;
+}
+int read_weight_file(const char *filename, int64_t *rows, int64_t *cols,
+                     int *blocksize, uint8_t **packed, uint8_t **absmax_q,
+                     __half **absmax2, __half **code2, float *offset) {
+  FILE *fp = fopen(filename, "rb");
+  if (!fp) {
+    fprintf(stderr, "无法打开文件: %s\n", filename);
+    return -1;
+  }
+
+  if (!read_exact(fp, rows, sizeof(int64_t), 1) ||
+      !read_exact(fp, cols, sizeof(int64_t), 1) ||
+      !read_exact(fp, blocksize, sizeof(int32_t), 1)) {
+    fprintf(stderr, "读取文件头失败: %s\n", filename);
+    fclose(fp);
+    return -1;
+  }
+
+  int64_t total_elements = (*rows) * (*cols);
+  int64_t num_packed = (total_elements + 1) / 2;
+  int64_t num_blocks = (total_elements + *blocksize - 1) / *blocksize;
+  int64_t num_groups = (num_blocks + 255) / 256;
+
+  printf("\n文件信息:\n");
+  printf("  矩阵: %ld x %ld\n", *rows, *cols);
+  printf("  总元素数: %ld\n", total_elements);
+  printf("  blocksize: %d\n", *blocksize);
+  printf("  打包数据大小: %ld bytes\n", num_packed);
+  printf("  量化块数: %ld\n", num_blocks);
+  printf("  分组数: %ld\n", num_groups);
+
+  *packed = (uint8_t *)malloc(num_packed);
+  *absmax_q = (uint8_t *)malloc(num_blocks);
+  *absmax2 = (__half *)malloc(num_groups * sizeof(__half));
+  *code2 = (__half *)malloc(256 * sizeof(__half));
+
+  if (!*packed || !*absmax_q || !*absmax2 || !*code2) {
+    fprintf(stderr, "主机内存分配失败\n");
+    fclose(fp);
+    return -1;
+  }
+
+  if (!read_exact(fp, *packed, 1, num_packed) ||
+      !read_exact(fp, *absmax_q, 1, num_blocks) ||
+      !read_exact(fp, *absmax2, sizeof(__half), num_groups) ||
+      !read_exact(fp, *code2, sizeof(__half), 256) ||
+      !read_exact(fp, offset, sizeof(float), 1)) {
+    fprintf(stderr, "读取量化数据失败: %s\n", filename);
+    fclose(fp);
+    return -1;
+  }
+
+  fclose(fp);
+  printf("文件读取成功\n");
+  return 0;
+}
+
+void save_dequantized_weight(const char *filename, __half *weight,
+                             int64_t total_elements) {
+  ensure_directory_exists("../mx_results");
+
+  char full_path[512];
+  snprintf(full_path, sizeof(full_path), "../mx_results/%s", filename);
+
+  FILE *fp = fopen(full_path, "wb");
+  if (!fp) {
+    fprintf(stderr, "无法创建输出文件: %s\n", full_path);
+    return;
+  }
+
+  fwrite(weight, sizeof(__half), total_elements, fp);
+  fclose(fp);
+
+  printf("已保存解量化结果: %s (%.2f MB)\n", full_path,
+         (total_elements * sizeof(__half)) / (1024.0 * 1024.0));
+}
+
+double get_time_ms() {
+  struct timeval tv;
+  gettimeofday(&tv, NULL);
+  return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
+}
+
+double calculate_bandwidth(int64_t total_elements, double time_ms, int blocksize) {
+  int64_t input_bytes = (total_elements + 1) / 2;
+  int64_t num_blocks = (total_elements + blocksize - 1) / blocksize;
+  int64_t num_groups = (num_blocks + 255) / 256;
+  input_bytes += num_blocks;
+  input_bytes += num_groups * sizeof(__half);
+  input_bytes += 256 * sizeof(__half);
+
+  int64_t output_bytes = total_elements * sizeof(__half);
+  int64_t total_bytes = input_bytes + output_bytes;
+
+  return (total_bytes / (1024.0 * 1024.0 * 1024.0)) / (time_ms / 1000.0);
+}
+
+int main(int argc, char **argv) {
+  if (argc != 2) {
+    printf("\n使用方法: %s <权重文件.bin>\n", argv[0]);
+    printf("示例: %s ../weight_data/weight_1024x1024_bs64.bin\n\n", argv[0]);
+    return -1;
+  }
+
+  const char *input_file = argv[1];
+
+  ensure_directory_exists("../mx_results");
+  init_nf4_lut();
+
+  int64_t rows, cols;
+  int blocksize;
+  uint8_t *h_packed = nullptr;
+  uint8_t *h_absmax_q = nullptr;
+  __half *h_absmax2 = nullptr;
+  __half *h_code2 = nullptr;
+  float offset = 0.f;
+
+  if (read_weight_file(input_file, &rows, &cols, &blocksize, &h_packed,
+                       &h_absmax_q, &h_absmax2, &h_code2, &offset) != 0) {
+    return -1;
+  }
+
+  int64_t total_elements = rows * cols;
+  int64_t num_packed = (total_elements + 1) / 2;
+  int64_t num_blocks = (total_elements + blocksize - 1) / blocksize;
+  int64_t num_groups = (num_blocks + 255) / 256;
+
+  uint8_t *d_packed = nullptr;
+  uint8_t *d_absmax_q = nullptr;
+  __half *d_absmax2 = nullptr;
+  __half *d_code2 = nullptr;
+  __half *d_output = nullptr;
+
+  GPU_CHECK(mcMalloc(&d_packed, num_packed));
+  GPU_CHECK(mcMalloc(&d_absmax_q, num_blocks));
+  GPU_CHECK(mcMalloc(&d_absmax2, num_groups * sizeof(__half)));
+  GPU_CHECK(mcMalloc(&d_code2, 256 * sizeof(__half)));
+  GPU_CHECK(mcMalloc(&d_output, total_elements * sizeof(__half)));
+
+  GPU_CHECK(mcMemcpy(d_packed, h_packed, num_packed, mcMemcpyHostToDevice));
+  GPU_CHECK(
+      mcMemcpy(d_absmax_q, h_absmax_q, num_blocks, mcMemcpyHostToDevice));
+  GPU_CHECK(mcMemcpy(d_absmax2, h_absmax2, num_groups * sizeof(__half),
+                       mcMemcpyHostToDevice));
+  GPU_CHECK(
+      mcMemcpy(d_code2, h_code2, 256 * sizeof(__half), mcMemcpyHostToDevice));
+  GPU_CHECK(mcMemcpyToSymbol(CODE2_LUT, h_code2, 256 * sizeof(__half)));
+
+  __half *h_output = (__half *)malloc(total_elements * sizeof(__half));
+  if (!h_output) {
+    fprintf(stderr, "主机输出内存分配失败\n");
+    return -1;
+  }
+
+  int threads = 256;
+  int64_t total_bytes = total_elements >> 1;
+  int blocks = (total_bytes / 4 + threads - 1) / threads;
+
+  for (int i = 0; i < 5; ++i) {
+    nf4_dequant_v7<<<blocks, threads>>>(d_packed, d_absmax_q, d_absmax2, offset,
+                                        total_elements, blocksize, 256,
+                                        d_output);
+  }
+  GPU_CHECK(mcDeviceSynchronize());
+
+  double start_time = get_time_ms();
+  for (int i = 0; i < 100; ++i) {
+    nf4_dequant_v7<<<blocks, threads>>>(d_packed, d_absmax_q, d_absmax2, offset,
+                                        total_elements, blocksize, 256,
+                                        d_output);
+  }
+  GPU_CHECK(mcDeviceSynchronize());
+  double end_time = get_time_ms();
+
+  double avg_time_ms = (end_time - start_time) / 100.0;
+  double bandwidth = calculate_bandwidth(total_elements, avg_time_ms, blocksize);
+
+  GPU_CHECK(mcMemcpy(h_output, d_output, total_elements * sizeof(__half),
+                       mcMemcpyDeviceToHost));
+
+  char output_file[256];
+  snprintf(output_file, sizeof(output_file), "dequant_%ldx%ld_bs%d.fp16", rows,
+           cols, blocksize);
+  save_dequantized_weight(output_file, h_output, total_elements);
+
+  char log_file[256];
+  snprintf(log_file, sizeof(log_file), "perf_%ldx%ld_bs%d.log", rows, cols,
+           blocksize);
+
+  char log_path[512];
+  snprintf(log_path, sizeof(log_path), "../mx_results/%s", log_file);
+  FILE *log_fp = fopen(log_path, "w");
+  if (log_fp) {
+    fprintf(log_fp, "input_file=%s\n", input_file);
+    fprintf(log_fp, "rows=%ld\n", rows);
+    fprintf(log_fp, "cols=%ld\n", cols);
+    fprintf(log_fp, "blocksize=%d\n", blocksize);
+    fprintf(log_fp, "total_elements=%ld\n", total_elements);
+    fprintf(log_fp, "kernel_time_ms=%.4f\n", avg_time_ms);
+    fprintf(log_fp, "bandwidth_gbps=%.2f\n", bandwidth);
+    fprintf(log_fp, "output_file=mx_results/%s\n", output_file);
+    fclose(log_fp);
+  }
+
+  printf("\n输入文件: %s\n", input_file);
+  printf("矩阵大小: %ld x %ld\n", rows, cols);
+  printf("核函数执行时间: %.4f ms\n", avg_time_ms);
+  printf("有效内存带宽: %.2f GB/s\n", bandwidth);
+  printf("输出文件: mx_results/%s\n", output_file);
+
+  free(h_packed);
+  free(h_absmax_q);
+  free(h_absmax2);
+  free(h_code2);
+  free(h_output);
+
+  mcFree(d_packed);
+  mcFree(d_absmax_q);
+  mcFree(d_absmax2);
+  mcFree(d_code2);
+  mcFree(d_output);
+
+  return 0;
+}
diff --git a/03_nf4_dequant/ayepei/mx/run.md b/03_nf4_dequant/ayepei/mx/run.md
new file mode 100644
index 0000000..3bcdb05
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx/run.md
@@ -0,0 +1,45 @@
+# 沐曦 NF4 反量化
+
+## 1) 编译
+
+优先使用 `mxcc`（沐曦环境），没有则回退 `nvcc`（CUDA 兼容环境）。
+
+```bash
+cd 03_nf4_dequant/mx
+
+# 方式1：脚本自动选编译器
+bash run_all.sh
+
+# 方式2：手动编译
+mxcc -O3 -std=c++17 -o nf4_dequant_mx nf4_dequant_mx.maca
+
+
+## 2) 单文件运行
+
+```bash
+./nf4_dequant_mx ../weight_data/weight_1024x1024_bs64.bin
+```
+
+输出：
+- 解量化结果：`../mx_results/dequant_<shape>_bs<block>.fp16`
+- 性能日志：`../mx_results/perf_<shape>_bs<block>.log`
+
+## 3) 批量运行
+
+```bash
+bash run_all.sh
+```
+
+
+
+## 4) 与 BnB 结果对比
+
+先保证 `../bnb_results` 与 `../bnb_benchmark_results.csv` 已存在。
+
+```bash
+python compare_results.py
+```
+
+将生成：
+- `comparison_mx_results.csv`
+- `comparison_mx_results.md`
diff --git a/03_nf4_dequant/ayepei/mx/run_all.sh b/03_nf4_dequant/ayepei/mx/run_all.sh
new file mode 100755
index 0000000..41b4097
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx/run_all.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -euo pipefail
+
+if [ ! -f "./nf4_dequant_mx" ]; then
+  echo "编译沐曦 NF4 程序..."
+  if command -v mxcc >/dev/null 2>&1; then
+    mxcc -O3 -std=c++17 -o nf4_dequant_mx nf4_dequant_mx.maca
+  else
+    echo "未找到 mxcc，请先加载沐曦编译环境。"
+    exit 1
+  fi
+fi
+
+for f in ../weight_data/*.bin; do
+  ./nf4_dequant_mx "$f"
+done
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_1024x1024_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_1024x1024_bs128.log
new file mode 100644
index 0000000..373191b
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_1024x1024_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_1024x1024_bs128.bin
+rows=1024
+cols=1024
+blocksize=128
+total_elements=1048576
+kernel_time_ms=0.0248
+bandwidth_gbps=98.89
+output_file=mx_results/dequant_1024x1024_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_1024x1024_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_1024x1024_bs64.log
new file mode 100644
index 0000000..4fca2fe
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_1024x1024_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_1024x1024_bs64.bin
+rows=1024
+cols=1024
+blocksize=64
+total_elements=1048576
+kernel_time_ms=0.0169
+bandwidth_gbps=144.99
+output_file=mx_results/dequant_1024x1024_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_16384x16384_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_16384x16384_bs128.log
new file mode 100644
index 0000000..4f974bb
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_16384x16384_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_16384x16384_bs128.bin
+rows=16384
+cols=16384
+blocksize=128
+total_elements=268435456
+kernel_time_ms=0.9282
+bandwidth_gbps=675.43
+output_file=mx_results/dequant_16384x16384_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_16384x16384_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_16384x16384_bs64.log
new file mode 100644
index 0000000..5150ce6
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_16384x16384_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_16384x16384_bs64.bin
+rows=16384
+cols=16384
+blocksize=64
+total_elements=268435456
+kernel_time_ms=1.0106
+bandwidth_gbps=622.36
+output_file=mx_results/dequant_16384x16384_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_2048x2048_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_2048x2048_bs128.log
new file mode 100644
index 0000000..eb8695b
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_2048x2048_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_2048x2048_bs128.bin
+rows=2048
+cols=2048
+blocksize=128
+total_elements=4194304
+kernel_time_ms=0.0325
+bandwidth_gbps=301.44
+output_file=mx_results/dequant_2048x2048_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_2048x2048_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_2048x2048_bs64.log
new file mode 100644
index 0000000..84487fa
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_2048x2048_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_2048x2048_bs64.bin
+rows=2048
+cols=2048
+blocksize=64
+total_elements=4194304
+kernel_time_ms=0.0281
+bandwidth_gbps=349.12
+output_file=mx_results/dequant_2048x2048_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_256x256_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_256x256_bs128.log
new file mode 100644
index 0000000..fd8002e
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_256x256_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_256x256_bs128.bin
+rows=256
+cols=256
+blocksize=128
+total_elements=65536
+kernel_time_ms=0.0093
+bandwidth_gbps=16.44
+output_file=mx_results/dequant_256x256_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_256x256_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_256x256_bs64.log
new file mode 100644
index 0000000..fff7366
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_256x256_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_256x256_bs64.bin
+rows=256
+cols=256
+blocksize=64
+total_elements=65536
+kernel_time_ms=0.0094
+bandwidth_gbps=16.44
+output_file=mx_results/dequant_256x256_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_3421x3146_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_3421x3146_bs128.log
new file mode 100644
index 0000000..3d4f92b
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_3421x3146_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_3421x3146_bs128.bin
+rows=3421
+cols=3146
+blocksize=128
+total_elements=10762466
+kernel_time_ms=0.0509
+bandwidth_gbps=494.16
+output_file=mx_results/dequant_3421x3146_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_3421x3146_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_3421x3146_bs64.log
new file mode 100644
index 0000000..1ce3918
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_3421x3146_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_3421x3146_bs64.bin
+rows=3421
+cols=3146
+blocksize=64
+total_elements=10762466
+kernel_time_ms=0.0517
+bandwidth_gbps=488.13
+output_file=mx_results/dequant_3421x3146_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_4096x4096_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_4096x4096_bs128.log
new file mode 100644
index 0000000..139738b
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_4096x4096_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_4096x4096_bs128.bin
+rows=4096
+cols=4096
+blocksize=128
+total_elements=16777216
+kernel_time_ms=0.0813
+bandwidth_gbps=482.23
+output_file=mx_results/dequant_4096x4096_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_4096x4096_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_4096x4096_bs64.log
new file mode 100644
index 0000000..b793bac
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_4096x4096_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_4096x4096_bs64.bin
+rows=4096
+cols=4096
+blocksize=64
+total_elements=16777216
+kernel_time_ms=0.0735
+bandwidth_gbps=534.60
+output_file=mx_results/dequant_4096x4096_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_512x512_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_512x512_bs128.log
new file mode 100644
index 0000000..fa81358
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_512x512_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_512x512_bs128.bin
+rows=512
+cols=512
+blocksize=128
+total_elements=262144
+kernel_time_ms=0.0154
+bandwidth_gbps=39.84
+output_file=mx_results/dequant_512x512_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_512x512_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_512x512_bs64.log
new file mode 100644
index 0000000..5b53834
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_512x512_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_512x512_bs64.bin
+rows=512
+cols=512
+blocksize=64
+total_elements=262144
+kernel_time_ms=0.0154
+bandwidth_gbps=39.91
+output_file=mx_results/dequant_512x512_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_6578x1236_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_6578x1236_bs128.log
new file mode 100644
index 0000000..0afbdb3
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_6578x1236_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_6578x1236_bs128.bin
+rows=6578
+cols=1236
+blocksize=128
+total_elements=8130408
+kernel_time_ms=0.0402
+bandwidth_gbps=472.88
+output_file=mx_results/dequant_6578x1236_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_6578x1236_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_6578x1236_bs64.log
new file mode 100644
index 0000000..ae554f6
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_6578x1236_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_6578x1236_bs64.bin
+rows=6578
+cols=1236
+blocksize=64
+total_elements=8130408
+kernel_time_ms=0.0485
+bandwidth_gbps=392.61
+output_file=mx_results/dequant_6578x1236_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_7000x7000_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_7000x7000_bs128.log
new file mode 100644
index 0000000..6b99edc
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_7000x7000_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_7000x7000_bs128.bin
+rows=7000
+cols=7000
+blocksize=128
+total_elements=49000000
+kernel_time_ms=0.1881
+bandwidth_gbps=608.44
+output_file=mx_results/dequant_7000x7000_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_7000x7000_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_7000x7000_bs64.log
new file mode 100644
index 0000000..3ea1f6d
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_7000x7000_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_7000x7000_bs64.bin
+rows=7000
+cols=7000
+blocksize=64
+total_elements=49000000
+kernel_time_ms=0.1905
+bandwidth_gbps=602.62
+output_file=mx_results/dequant_7000x7000_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_8192x8192_bs128.log b/03_nf4_dequant/ayepei/mx_results/perf_8192x8192_bs128.log
new file mode 100644
index 0000000..b10832c
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_8192x8192_bs128.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_8192x8192_bs128.bin
+rows=8192
+cols=8192
+blocksize=128
+total_elements=67108864
+kernel_time_ms=0.2358
+bandwidth_gbps=664.67
+output_file=mx_results/dequant_8192x8192_bs128.fp16
diff --git a/03_nf4_dequant/ayepei/mx_results/perf_8192x8192_bs64.log b/03_nf4_dequant/ayepei/mx_results/perf_8192x8192_bs64.log
new file mode 100644
index 0000000..7d0ca25
--- /dev/null
+++ b/03_nf4_dequant/ayepei/mx_results/perf_8192x8192_bs64.log
@@ -0,0 +1,8 @@
+input_file=../weight_data/weight_8192x8192_bs64.bin
+rows=8192
+cols=8192
+blocksize=64
+total_elements=67108864
+kernel_time_ms=0.2625
+bandwidth_gbps=599.08
+output_file=mx_results/dequant_8192x8192_bs64.fp16
diff --git a/03_nf4_dequant/ayepei/nv/compare_results.py b/03_nf4_dequant/ayepei/nv/compare_results.py
new file mode 100644
index 0000000..374f226
--- /dev/null
+++ b/03_nf4_dequant/ayepei/nv/compare_results.py
@@ -0,0 +1,146 @@
+import torch
+import numpy as np
+import pandas as pd
+import csv
+from pathlib import Path
+
+
+def compare_all():
+    """对比所有 cuda 和 BnB 的结果"""
+
+    BASE = Path("..")
+
+    cuda_DIR = BASE / "cuda_results"
+    BNB_DIR = BASE / "bnb_results"
+    BNB_CSV = BASE / "bnb_benchmark_results.csv"
+
+    results = []
+
+    # 读取 BnB benchmark CSV
+    if not BNB_CSV.exists():
+        print(f" 找不到 {BNB_CSV}")
+        return
+
+    bnb_data = {}
+    with open(BNB_CSV, 'r') as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            key = f"{row['Shape']}_bs{row['Blocksize']}"
+            bnb_data[key] = {
+                'time_ms': float(row['BnB Time (ms)']),
+                'bnb_file': row['BnB Output File']
+            }
+
+    # 查找所有 cuda 结果
+    cuda_files = list(cuda_DIR.glob("dequant_*.fp16"))
+
+    print("\n" + "="*80)
+    print("对比结果汇总")
+    print("="*80)
+    print(f"{'Shape':<12} {'Block':<6} {'BnB (ms)':<12} {'cuda (ms)':<12} "
+          f"{'Speedup':<8} {'MAE':<12} {'Max Diff':<12}")
+    print("-"*80)
+
+    for cuda_file in cuda_files:
+        filename = cuda_file.name
+
+        parts = filename.replace('dequant_', '').replace('.fp16', '').split('_bs')
+        shape = parts[0]
+        blocksize = parts[1]
+
+        # 读取 cuda 输出
+        cuda_data = np.fromfile(cuda_file, dtype=np.float16)
+        cuda_tensor = torch.from_numpy(cuda_data)
+
+        # 读取 BnB 输出
+        bnb_file = BNB_DIR / f"bnb_{shape}_bs{blocksize}.fp16"
+        if not bnb_file.exists():
+            print(f" 找不到 BnB 文件: {bnb_file}")
+            continue
+
+        bnb_data_np = np.fromfile(bnb_file, dtype=np.float16)
+        bnb_tensor = torch.from_numpy(bnb_data_np)
+
+
+
+        # 误差计算
+        mae = torch.mean(torch.abs(bnb_tensor - cuda_tensor)).item()
+        mse = torch.mean((bnb_tensor - cuda_tensor) ** 2).item()
+        max_diff = torch.max(torch.abs(bnb_tensor - cuda_tensor)).item()
+
+        # 时间读取
+        key = f"{shape}_bs{blocksize}"
+        bnb_time = bnb_data[key]['time_ms']
+
+        log_file = cuda_DIR / f"perf_{shape}_bs{blocksize}.log"
+        cuda_time = 0
+
+        if log_file.exists():
+            with open(log_file, 'r') as f:
+                for line in f:
+                    if 'kernel_time_ms' in line:
+                        cuda_time = float(line.strip().split('=')[1])
+
+        speedup = bnb_time / cuda_time if cuda_time > 0 else 0
+
+        print(f"{shape:<12} {blocksize:<6} "
+              f"{bnb_time:<12.4f} {cuda_time:<12.4f} "
+              f"{speedup:<8.2f} {mae:<12.8f} {max_diff:<12.8f}")
+
+        results.append({
+            'shape': shape,
+            'blocksize': int(blocksize),
+            'bnb_time_ms': bnb_time,
+            'cuda_time_ms': cuda_time,
+            'speedup': speedup,
+            'mae': mae,
+            'mse': mse,
+            'max_diff': max_diff,
+        })
+
+    print("="*80)
+
+
+    # 保存 CSV
+    df = pd.DataFrame(results)
+    out_csv =  "comparison_cuda_results.csv"
+    float_format = '%.8f' 
+    df.to_csv(out_csv, index=False, float_format=float_format)
+
+    print(f"\n 对比结果已保存到: {out_csv}")
+    md_file = "comparison_cuda_results.md"
+    with open(md_file, 'w', encoding='utf-8') as f:
+        # 写入表格标题
+        f.write("# cuda vs BnB 对比结果\n\n")
+        
+        # 写入表头
+        f.write("| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff |\n")
+        f.write("|-------|-------|----------|-----------|---------|-----|----------|\n")
+        
+        # 写入数据
+        for _, row in df.iterrows():
+            line = (f"| {row['shape']} | {row['blocksize']} | "
+                   f"{row['bnb_time_ms']:.4f} | {row['cuda_time_ms']:.4f} | "
+                   f"{row['speedup']:.2f} | {row['mae']:.8f} | {row['max_diff']:.8f} |")
+            f.write(line + '\n')
+        
+        # 添加详细数据表
+        f.write("\n\n## 详细数据\n\n")
+        f.write("| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | MSE | Max Diff |\n")
+        f.write("|-------|-------|----------|-----------|---------|-----|-----|----------|\n")
+        
+        for _, row in df.iterrows():
+            line = (f"| {row['shape']} | {row['blocksize']} | "
+                   f"{row['bnb_time_ms']:.4f} | {row['cuda_time_ms']:.4f} | "
+                   f"{row['speedup']:.2f} | {row['mae']:.8f} | {row['mse']:.8f} | "
+                   f"{row['max_diff']:.8f}  |")
+            f.write(line + '\n')
+    
+    print(f" Markdown表格已保存到: {md_file}")
+
+    return df
+
+
+
+if __name__ == "__main__":
+    compare_all()
\ No newline at end of file
diff --git a/03_nf4_dequant/ayepei/nv/comparison_cuda_results.csv b/03_nf4_dequant/ayepei/nv/comparison_cuda_results.csv
new file mode 100644
index 0000000..8e1718b
--- /dev/null
+++ b/03_nf4_dequant/ayepei/nv/comparison_cuda_results.csv
@@ -0,0 +1,21 @@
+shape,blocksize,bnb_time_ms,cuda_time_ms,speedup,mae,mse,max_diff
+4096x4096,128,0.09050000,0.01340000,6.75373134,0.00020349,0.00000018,0.00390625
+4096x4096,64,0.08910000,0.01370000,6.50364964,0.00018179,0.00000018,0.00390625
+3421x3146,64,0.09250000,0.00860000,10.75581395,0.00017142,0.00000012,0.00390625
+7000x7000,128,0.09390000,0.04120000,2.27912621,0.00025988,0.00000024,0.00390625
+2048x2048,64,0.09100000,0.00470000,19.36170213,0.00022352,0.00000018,0.00390625
+8192x8192,128,0.09120000,0.05520000,1.65217391,0.00022542,0.00000018,0.00390625
+6578x1236,64,0.08950000,0.00700000,12.78571429,0.00022066,0.00000018,0.00390625
+2048x2048,128,0.09250000,0.00480000,19.27083333,0.00029397,0.00000030,0.00390625
+7000x7000,64,0.09090000,0.04160000,2.18509615,0.00024140,0.00000024,0.00390625
+256x256,128,0.10640000,0.00280000,38.00000000,0.00024271,0.00000024,0.00390625
+256x256,64,0.11440000,0.00280000,40.85714286,0.00022900,0.00000024,0.00390625
+6578x1236,128,0.08760000,0.00700000,12.51428571,0.00026894,0.00000024,0.00390625
+512x512,64,0.10100000,0.00280000,36.07142857,0.00018466,0.00000018,0.00390625
+16384x16384,128,0.29730000,0.21040000,1.41302281,0.00022352,0.00000018,0.00390625
+16384x16384,64,0.30740000,0.21100000,1.45687204,0.00019693,0.00000018,0.00390625
+8192x8192,64,0.09180000,0.05560000,1.65107914,0.00017440,0.00000012,0.00390625
+1024x1024,64,0.09600000,0.00300000,32.00000000,0.00028801,0.00000030,0.00390625
+3421x3146,128,0.09040000,0.00850000,10.63529412,0.00026965,0.00000024,0.00390625
+1024x1024,128,0.09120000,0.00300000,30.40000000,0.00024605,0.00000024,0.00390625
+512x512,128,0.09770000,0.00260000,37.57692308,0.00029063,0.00000030,0.00390625
diff --git a/03_nf4_dequant/ayepei/nv/comparison_cuda_results.md b/03_nf4_dequant/ayepei/nv/comparison_cuda_results.md
new file mode 100644
index 0000000..d6bd827
--- /dev/null
+++ b/03_nf4_dequant/ayepei/nv/comparison_cuda_results.md
@@ -0,0 +1,50 @@
+# cuda vs BnB 对比结果
+
+| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff |
+|-------|-------|----------|-----------|---------|-----|----------|
+| 4096x4096 | 128 | 0.0905 | 0.0134 | 6.75 | 0.00020349 | 0.00390625 |
+| 4096x4096 | 64 | 0.0891 | 0.0137 | 6.50 | 0.00018179 | 0.00390625 |
+| 3421x3146 | 64 | 0.0925 | 0.0086 | 10.76 | 0.00017142 | 0.00390625 |
+| 7000x7000 | 128 | 0.0939 | 0.0412 | 2.28 | 0.00025988 | 0.00390625 |
+| 2048x2048 | 64 | 0.0910 | 0.0047 | 19.36 | 0.00022352 | 0.00390625 |
+| 8192x8192 | 128 | 0.0912 | 0.0552 | 1.65 | 0.00022542 | 0.00390625 |
+| 6578x1236 | 64 | 0.0895 | 0.0070 | 12.79 | 0.00022066 | 0.00390625 |
+| 2048x2048 | 128 | 0.0925 | 0.0048 | 19.27 | 0.00029397 | 0.00390625 |
+| 7000x7000 | 64 | 0.0909 | 0.0416 | 2.19 | 0.00024140 | 0.00390625 |
+| 256x256 | 128 | 0.1064 | 0.0028 | 38.00 | 0.00024271 | 0.00390625 |
+| 256x256 | 64 | 0.1144 | 0.0028 | 40.86 | 0.00022900 | 0.00390625 |
+| 6578x1236 | 128 | 0.0876 | 0.0070 | 12.51 | 0.00026894 | 0.00390625 |
+| 512x512 | 64 | 0.1010 | 0.0028 | 36.07 | 0.00018466 | 0.00390625 |
+| 16384x16384 | 128 | 0.2973 | 0.2104 | 1.41 | 0.00022352 | 0.00390625 |
+| 16384x16384 | 64 | 0.3074 | 0.2110 | 1.46 | 0.00019693 | 0.00390625 |
+| 8192x8192 | 64 | 0.0918 | 0.0556 | 1.65 | 0.00017440 | 0.00390625 |
+| 1024x1024 | 64 | 0.0960 | 0.0030 | 32.00 | 0.00028801 | 0.00390625 |
+| 3421x3146 | 128 | 0.0904 | 0.0085 | 10.64 | 0.00026965 | 0.00390625 |
+| 1024x1024 | 128 | 0.0912 | 0.0030 | 30.40 | 0.00024605 | 0.00390625 |
+| 512x512 | 128 | 0.0977 | 0.0026 | 37.58 | 0.00029063 | 0.00390625 |
+
+
+## 详细数据
+
+| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | MSE | Max Diff |
+|-------|-------|----------|-----------|---------|-----|-----|----------|
+| 4096x4096 | 128 | 0.0905 | 0.0134 | 6.75 | 0.00020349 | 0.00000018 | 0.00390625  |
+| 4096x4096 | 64 | 0.0891 | 0.0137 | 6.50 | 0.00018179 | 0.00000018 | 0.00390625  |
+| 3421x3146 | 64 | 0.0925 | 0.0086 | 10.76 | 0.00017142 | 0.00000012 | 0.00390625  |
+| 7000x7000 | 128 | 0.0939 | 0.0412 | 2.28 | 0.00025988 | 0.00000024 | 0.00390625  |
+| 2048x2048 | 64 | 0.0910 | 0.0047 | 19.36 | 0.00022352 | 0.00000018 | 0.00390625  |
+| 8192x8192 | 128 | 0.0912 | 0.0552 | 1.65 | 0.00022542 | 0.00000018 | 0.00390625  |
+| 6578x1236 | 64 | 0.0895 | 0.0070 | 12.79 | 0.00022066 | 0.00000018 | 0.00390625  |
+| 2048x2048 | 128 | 0.0925 | 0.0048 | 19.27 | 0.00029397 | 0.00000030 | 0.00390625  |
+| 7000x7000 | 64 | 0.0909 | 0.0416 | 2.19 | 0.00024140 | 0.00000024 | 0.00390625  |
+| 256x256 | 128 | 0.1064 | 0.0028 | 38.00 | 0.00024271 | 0.00000024 | 0.00390625  |
+| 256x256 | 64 | 0.1144 | 0.0028 | 40.86 | 0.00022900 | 0.00000024 | 0.00390625  |
+| 6578x1236 | 128 | 0.0876 | 0.0070 | 12.51 | 0.00026894 | 0.00000024 | 0.00390625  |
+| 512x512 | 64 | 0.1010 | 0.0028 | 36.07 | 0.00018466 | 0.00000018 | 0.00390625  |
+| 16384x16384 | 128 | 0.2973 | 0.2104 | 1.41 | 0.00022352 | 0.00000018 | 0.00390625  |
+| 16384x16384 | 64 | 0.3074 | 0.2110 | 1.46 | 0.00019693 | 0.00000018 | 0.00390625  |
+| 8192x8192 | 64 | 0.0918 | 0.0556 | 1.65 | 0.00017440 | 0.00000012 | 0.00390625  |
+| 1024x1024 | 64 | 0.0960 | 0.0030 | 32.00 | 0.00028801 | 0.00000030 | 0.00390625  |
+| 3421x3146 | 128 | 0.0904 | 0.0085 | 10.64 | 0.00026965 | 0.00000024 | 0.00390625  |
+| 1024x1024 | 128 | 0.0912 | 0.0030 | 30.40 | 0.00024605 | 0.00000024 | 0.00390625  |
+| 512x512 | 128 | 0.0977 | 0.0026 | 37.58 | 0.00029063 | 0.00000030 | 0.00390625  |
diff --git a/03_nf4_dequant/ayepei/nv/nf4_dequant_cuda b/03_nf4_dequant/ayepei/nv/nf4_dequant_cuda
new file mode 100755
index 0000000..352cffb
Binary files /dev/null and b/03_nf4_dequant/ayepei/nv/nf4_dequant_cuda differ
diff --git a/03_nf4_dequant/ayepei/nv/nf4_dequant_cuda.cu b/03_nf4_dequant/ayepei/nv/nf4_dequant_cuda.cu
new file mode 100644
index 0000000..e213708
--- /dev/null
+++ b/03_nf4_dequant/ayepei/nv/nf4_dequant_cuda.cu
@@ -0,0 +1,643 @@
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h> 
+#include <sys/time.h>
+
+#define CUDA_CHECK(call)                                                       \
+  do {                                                                         \
+    cudaError_t err = call;                                                    \
+    if (err != cudaSuccess) {                                                  \
+      fprintf(stderr, "CUDA error at %s:%d - %s\n", __FILE__, __LINE__,        \
+              cudaGetErrorString(err));                                        \
+      exit(EXIT_FAILURE);                                                      \
+    }                                                                          \
+  } while (0)
+
+// 将表放入到常量内存加快速度
+__constant__ __half NF4_LUT_HALF[16];
+__constant__ __half CODE2_LUT[256];
+
+void ensure_directory_exists(const char *path) {
+  struct stat st = {0};
+  if (stat(path, &st) == -1) {
+#ifdef _WIN32
+    mkdir(path);
+#else
+    mkdir(path, 0755);
+#endif
+  }
+}
+__constant__ float NF4_LUT[16] = {
+    -1.00000000f, -0.69619280f, -0.52507305f, -0.39491710f,
+    -0.28444138f, -0.18477343f, -0.09105003f, 0.00000000f,
+    0.07958030f,  0.16093020f,  0.24611230f,  0.33791524f,
+    0.44070983f,  0.56261700f,  0.72295684f,  1.00000000f};
+
+// 初始化 LUT (将 float 转换为 half)
+void init_nf4_lut() {
+  float lut_f[16] = {-1.00000000f, -0.69619280f, -0.52507305f, -0.39491710f,
+                     -0.28444138f, -0.18477343f, -0.09105003f, 0.00000000f,
+                     0.07958030f,  0.16093020f,  0.24611230f,  0.33791524f,
+                     0.44070983f,  0.56261700f,  0.72295684f,  1.00000000f};
+
+  __half lut_h[16];
+  for (int i = 0; i < 16; i++) {
+    lut_h[i] = __float2half(lut_f[i]);
+  }
+  CUDA_CHECK(cudaMemcpyToSymbol(NF4_LUT_HALF, lut_h, sizeof(lut_h)));
+}
+// 基础nf4反量化，在计算中使用float进行计算
+__global__ void nf4_dequant_v1(
+    const uint8_t *__restrict__ packed, const uint8_t *__restrict__ absmax_q,
+    const __half *__restrict__ absmax2, // 修改为 __half
+    const __half *__restrict__ code2,   // 修改为 __half
+    float offset, int64_t total_half_elements, int blocksize, int group_size,
+    __half2 *__restrict__ output) {
+
+  int64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid >= total_half_elements)
+    return;
+
+  uint8_t val = packed[tid];
+
+  int64_t idx_in_elements = tid << 1;
+  int64_t block_idx = idx_in_elements / blocksize;
+  int64_t group_idx = block_idx / group_size;
+
+
+  float s1 = __half2float(code2[absmax_q[block_idx]]);
+  float s2 = __half2float(absmax2[group_idx]);
+  float scale = (s1 * s2) + offset;
+
+  float v1 = NF4_LUT[val >> 4] * scale;
+  float v2 = NF4_LUT[val & 0x0F] * scale;
+
+  output[tid] = __floats2half2_rn(v1, v2);
+}
+
+
+// 中间的计算采用half
+__global__ void nf4_dequant_v2(
+    const uint8_t *__restrict__ packed,
+    const uint8_t *__restrict__ absmax_q,
+    const __half *__restrict__ absmax2,
+    const __half *__restrict__ code2,
+    float offset,
+    int64_t total_half_elements,
+    int blocksize,
+    int group_size,
+    __half2 *__restrict__ output)
+{
+    int64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= total_half_elements) return;
+
+    uint8_t val = packed[tid];
+
+    int64_t idx = tid << 1;
+    int64_t block_idx = idx / blocksize;
+    int64_t group_idx = block_idx / group_size;
+
+    // 全half scale
+    __half s1 = code2[absmax_q[block_idx]];
+    __half s2 = absmax2[group_idx];
+
+    __half scale = __hadd(__hmul(s1, s2),__float2half(offset) );
+
+    // LUT
+    __half v1 = __hmul(__float2half(NF4_LUT[val >> 4]), scale);
+    __half v2 = __hmul(__float2half(NF4_LUT[val & 0x0F]), scale);
+
+    output[tid] = __halves2half2(v1, v2);
+}
+
+
+
+// 预先先对NF4_LUT_HALF进行处理，转化为__half
+__global__ void nf4_dequant_v3(
+    const uint8_t *__restrict__ packed,
+    const uint8_t *__restrict__ absmax_q,
+    const __half *__restrict__ absmax2,
+    const __half *__restrict__ code2,
+    float offset,
+    int64_t total_half_elements,
+    int blocksize,
+    int group_size,
+    __half2 *__restrict__ output)
+{
+    int64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= total_half_elements) return;
+    uint8_t val = packed[tid];
+    int64_t idx = tid << 1;
+    int64_t block_idx = idx / blocksize;
+    int64_t group_idx = block_idx / group_size;
+    __half scale = __hadd(__hmul(code2[absmax_q[block_idx]], absmax2[group_idx]),__float2half(offset) );
+    __half v1 = __hmul(NF4_LUT_HALF[val >> 4], scale);
+    __half v2 = __hmul(NF4_LUT_HALF[val & 0x0F], scale);
+
+    output[tid] = __halves2half2(v1, v2);
+}
+// 利用共享内存存储scale，但当几个线程在进行存储时其他的线程并没有做其他的处理，时间并没有缩短
+__global__ void nf4_dequant_v4(
+    const uint8_t *__restrict__ packed,
+    const uint8_t *__restrict__ absmax_q,
+    const __half *__restrict__ absmax2,
+    const __half *__restrict__ code2,
+    float offset,
+    int64_t total_half_elements,
+    int blocksize,
+    int group_size,
+    __half2 *__restrict__ output)
+{
+    extern __shared__ __half shared_scale[];
+
+    int tid = threadIdx.x;
+    int idx = blockIdx.x * blockDim.x + tid;
+
+    int64_t cta_half_start = (int64_t)blockIdx.x * blockDim.x * 2;
+    int64_t cta_half_end   = min(cta_half_start + blockDim.x * 2,total_half_elements * 2);
+
+    int first_block = cta_half_start / blocksize;
+    int last_block  = (cta_half_end + blocksize - 1) / blocksize;
+
+    int num_scale = last_block - first_block;
+
+    for (int i = tid; i < num_scale; i += blockDim.x)
+    {
+        int block_idx = first_block + i;
+        int group_idx = block_idx / group_size;
+
+        __half s1 = code2[absmax_q[block_idx]];
+        __half s2 = absmax2[group_idx];
+
+        shared_scale[i] = __hadd(__hmul(s1, s2), __float2half(offset));
+    }
+
+    __syncthreads();
+
+    if (idx >= total_half_elements) return;
+
+    uint8_t val = packed[idx];
+
+    int64_t half_idx = ((int64_t)idx) << 1;
+
+    int block_idx   = half_idx / blocksize;
+    int local_block = block_idx - first_block;
+
+    __half scale = shared_scale[local_block];
+
+    __half v1 = __hmul(NF4_LUT_HALF[val >> 4], scale);
+    __half v2 = __hmul(NF4_LUT_HALF[val & 0xF], scale);
+
+    output[idx] = __halves2half2(v1, v2);
+}
+
+// 进行向量化读取，一个程序处理多个
+__global__ void nf4_dequant_v5(
+    const uint8_t *__restrict__ packed,
+    const uint8_t *__restrict__ absmax_q,
+    const __half *__restrict__ absmax2,
+    const __half *__restrict__ code2,
+    float offset,
+    int64_t total_half_elements,
+    int blocksize,
+    int group_size,
+    __half *__restrict__ output)
+{
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t total_bytes = total_half_elements >> 1;   // 2 half / byte
+    int64_t byte_idx = (int64_t)tid * 4;
+    if (byte_idx >= total_bytes) return;
+    uint32_t pack4 = ((const uint32_t*)packed)[tid];
+    uint8_t b0 =  pack4        & 0xFF;
+    uint8_t b1 = (pack4 >> 8 ) & 0xFF;
+    uint8_t b2 = (pack4 >> 16) & 0xFF;
+    uint8_t b3 = (pack4 >> 24) & 0xFF;
+    int64_t half_base = byte_idx << 1;
+    int block_idx = half_base / blocksize;
+    int group_idx = block_idx / group_size;
+    __half scale = __hadd(__hmul(code2[absmax_q[block_idx]], absmax2[group_idx]), __float2half(offset));
+    __half h[8];
+    h[0]   = __hmul(NF4_LUT_HALF[(b0)>>4], scale); 
+    h[1] = __hmul(NF4_LUT_HALF[(b0)&0xF], scale);
+    h[2]   = __hmul(NF4_LUT_HALF[(b1)>>4], scale); 
+    h[3] = __hmul(NF4_LUT_HALF[(b1)&0xF], scale);
+    h[4]   = __hmul(NF4_LUT_HALF[(b2)>>4], scale); 
+    h[5] = __hmul(NF4_LUT_HALF[(b2)&0xF], scale);
+    h[6]   = __hmul(NF4_LUT_HALF[(b3)>>4], scale); 
+    h[7] = __hmul(NF4_LUT_HALF[(b3)&0xF], scale);
+
+    uint4 out_pack;
+    reinterpret_cast<__half*>(&out_pack)[0] = h[0];
+    reinterpret_cast<__half*>(&out_pack)[1] = h[1];
+    reinterpret_cast<__half*>(&out_pack)[2] = h[2];
+    reinterpret_cast<__half*>(&out_pack)[3] = h[3];
+    reinterpret_cast<__half*>(&out_pack)[4] = h[4];
+    reinterpret_cast<__half*>(&out_pack)[5] = h[5];
+    reinterpret_cast<__half*>(&out_pack)[6] = h[6];
+    reinterpret_cast<__half*>(&out_pack)[7] = h[7];
+
+    ((uint4*)(output + half_base))[0] = out_pack;
+}
+// 将code也放入常量内存
+__global__ void nf4_dequant_v6(const uint8_t *__restrict__ packed,
+                               const uint8_t *__restrict__ absmax_q,
+                               const __half *__restrict__ absmax2, float offset,
+                               int64_t total_elements, int blocksize,
+                               int group_size, __half *__restrict__ output) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t total_bytes = total_elements >> 1; // 2 half / byte
+  int64_t byte_idx = (int64_t)tid * 4;
+  if (byte_idx >= total_bytes)
+    return;
+  uint32_t pack4 = ((const uint32_t *)packed)[tid];
+  uint8_t b0 = pack4 & 0xFF;
+  uint8_t b1 = (pack4 >> 8) & 0xFF;
+  uint8_t b2 = (pack4 >> 16) & 0xFF;
+  uint8_t b3 = (pack4 >> 24) & 0xFF;
+  int64_t half_base = byte_idx << 1;
+  int block_idx = half_base / blocksize;
+  int group_idx = block_idx / group_size;
+  __half scale =
+      __hadd(__hmul(CODE2_LUT[absmax_q[block_idx]], absmax2[group_idx]),
+             __float2half(offset));
+  __half h[8];
+  h[0] = __hmul(NF4_LUT_HALF[(b0) >> 4], scale);
+  h[1] = __hmul(NF4_LUT_HALF[(b0) & 0xF], scale);
+  h[2] = __hmul(NF4_LUT_HALF[(b1) >> 4], scale);
+  h[3] = __hmul(NF4_LUT_HALF[(b1) & 0xF], scale);
+  h[4] = __hmul(NF4_LUT_HALF[(b2) >> 4], scale);
+  h[5] = __hmul(NF4_LUT_HALF[(b2) & 0xF], scale);
+  h[6] = __hmul(NF4_LUT_HALF[(b3) >> 4], scale);
+  h[7] = __hmul(NF4_LUT_HALF[(b3) & 0xF], scale);
+
+  uint4 out_pack;
+  reinterpret_cast<__half *>(&out_pack)[0] = h[0];
+  reinterpret_cast<__half *>(&out_pack)[1] = h[1];
+  reinterpret_cast<__half *>(&out_pack)[2] = h[2];
+  reinterpret_cast<__half *>(&out_pack)[3] = h[3];
+  reinterpret_cast<__half *>(&out_pack)[4] = h[4];
+  reinterpret_cast<__half *>(&out_pack)[5] = h[5];
+  reinterpret_cast<__half *>(&out_pack)[6] = h[6];
+  reinterpret_cast<__half *>(&out_pack)[7] = h[7];
+
+  ((uint4 *)(output + half_base))[0] = out_pack;
+}
+// 将NF4_LUT由常量内存放到共享内存中来
+__global__ void nf4_dequant_v7(const uint8_t *__restrict__ packed,
+                               const uint8_t *__restrict__ absmax_q,
+                               const __half *__restrict__ absmax2, float offset,
+                               int64_t total_elements, int blocksize,
+                               int group_size, __half *__restrict__ output) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t total_bytes = total_elements >> 1; // 2 half / byte
+  int64_t byte_idx = (int64_t)tid * 4;
+  __shared__ __half s_nf4[16];
+
+  if(threadIdx.x<16){
+    s_nf4[threadIdx.x]=NF4_LUT_HALF[threadIdx.x];
+  }
+  if (byte_idx >= total_bytes)
+    return;
+  uint32_t pack4 = ((const uint32_t *)packed)[tid];
+  uint8_t b0 = pack4 & 0xFF;
+  uint8_t b1 = (pack4 >> 8) & 0xFF;
+  uint8_t b2 = (pack4 >> 16) & 0xFF;
+  uint8_t b3 = (pack4 >> 24) & 0xFF;
+  int64_t half_base = byte_idx << 1;
+  int block_idx = half_base / blocksize;
+  int group_idx = block_idx / group_size;
+  __half scale =
+      __hadd(__hmul(CODE2_LUT[absmax_q[block_idx]], absmax2[group_idx]),
+             __float2half(offset));
+  __half h[8];
+  h[0] = __hmul(s_nf4[(b0) >> 4], scale);
+  h[1] = __hmul(s_nf4[(b0) & 0xF], scale);
+  h[2] = __hmul(s_nf4[(b1) >> 4], scale);
+  h[3] = __hmul(s_nf4[(b1) & 0xF], scale);
+  h[4] = __hmul(s_nf4[(b2) >> 4], scale);
+  h[5] = __hmul(s_nf4[(b2) & 0xF], scale);
+  h[6] = __hmul(s_nf4[(b3) >> 4], scale);
+  h[7] = __hmul(s_nf4[(b3) & 0xF], scale);
+
+  uint4 out_pack;
+  reinterpret_cast<__half *>(&out_pack)[0] = h[0];
+  reinterpret_cast<__half *>(&out_pack)[1] = h[1];
+  reinterpret_cast<__half *>(&out_pack)[2] = h[2];
+  reinterpret_cast<__half *>(&out_pack)[3] = h[3];
+  reinterpret_cast<__half *>(&out_pack)[4] = h[4];
+  reinterpret_cast<__half *>(&out_pack)[5] = h[5];
+  reinterpret_cast<__half *>(&out_pack)[6] = h[6];
+  reinterpret_cast<__half *>(&out_pack)[7] = h[7];
+
+  ((uint4 *)(output + half_base))[0] = out_pack;
+}
+// 读取权重文件
+int read_weight_file(const char *filename, int64_t *rows, int64_t *cols,
+                     int *blocksize, uint8_t **packed, uint8_t **absmax_q,
+                     __half **absmax2, __half **code2, float *offset) {
+
+  FILE *fp = fopen(filename, "rb");
+  if (!fp) {
+    fprintf(stderr, " 无法打开文件: %s\n", filename);
+    return -1;
+  }
+
+  // 读取 header
+  fread(rows, sizeof(int64_t), 1, fp);
+  fread(cols, sizeof(int64_t), 1, fp);
+  fread(blocksize, sizeof(int32_t), 1, fp);
+
+  int64_t total_elements = (*rows) * (*cols);
+  int64_t num_packed = (total_elements + 1) / 2;
+  int64_t num_blocks = (total_elements + *blocksize - 1) / *blocksize;
+  int64_t num_groups = (num_blocks + 255) / 256;
+
+  printf("\n 文件信息:\n");
+  printf("  矩阵: %ld x %ld\n", *rows, *cols);
+  printf("  总元素数: %ld\n", total_elements);
+  printf("  blocksize: %d\n", *blocksize);
+  printf("  打包数据大小: %ld bytes\n", num_packed);
+  printf("  量化块数: %ld\n", num_blocks);
+  printf("  分组数: %ld\n", num_groups);
+
+  // 分配内存
+  *packed = (uint8_t *)malloc(num_packed);
+  *absmax_q = (uint8_t *)malloc(num_blocks);
+  *absmax2 = (__half *)malloc(num_groups * sizeof(__half));
+  *code2 = (__half *)malloc(256 * sizeof(__half));
+
+  if (!*packed || !*absmax_q || !*absmax2 || !*code2) {
+    fprintf(stderr, " 主机内存分配失败\n");
+    fclose(fp);
+    return -1;
+  }
+
+  // 读取数据
+  fread(*packed, 1, num_packed, fp);
+  fread(*absmax_q, 1, num_blocks, fp);
+  fread(*absmax2, sizeof(__half), num_groups, fp);
+  fread(*code2, sizeof(__half), 256, fp);
+  fread(offset, sizeof(float), 1, fp);
+
+  fclose(fp);
+  printf(" 文件读取成功\n");
+  return 0;
+}
+
+// 保存解量化后的权重（自动保存到cuda_results目录）
+void save_dequantized_weight(const char *filename, __half *weight,
+                             int64_t total_elements) {
+  ensure_directory_exists("../cuda_results");
+
+  // 构建完整路径
+  char full_path[512];
+  snprintf(full_path, sizeof(full_path), "../cuda_results/%s", filename);
+
+  FILE *fp = fopen(full_path, "wb");
+  if (!fp) {
+    fprintf(stderr, " 无法创建输出文件: %s\n", full_path);
+    return;
+  }
+
+  fwrite(weight, sizeof(__half), total_elements, fp);
+  fclose(fp);
+
+  printf(" 已保存解量化结果: %s (%.2f MB)\n", full_path,
+         (total_elements * sizeof(__half)) / (1024.0 * 1024.0));
+}
+
+
+// 计时器 (毫秒)
+double get_time_ms() {
+  struct timeval tv;
+  gettimeofday(&tv, NULL);
+  return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
+}
+
+
+// 计算有效内存带宽
+double calculate_bandwidth(int64_t total_elements, double time_ms) {
+  // 输入数据大小
+  int64_t input_bytes = (total_elements + 1) / 2; // packed
+  input_bytes += (total_elements + 64 - 1) / 64;  // absmax_q (近似)
+  input_bytes +=
+      ((total_elements + 64 - 1) / 64 + 255) / 256 * sizeof(__half); // absmax2
+  input_bytes += 256 * sizeof(__half);                               // code2
+
+  // 输出数据大小
+  int64_t output_bytes = total_elements * sizeof(__half);
+
+  int64_t total_bytes = input_bytes + output_bytes;
+
+  return (total_bytes / (1024.0 * 1024.0 * 1024.0)) / (time_ms / 1000.0);
+}
+
+// ============================================================
+// 主函数
+// ============================================================
+int main(int argc, char **argv) {
+
+  if (argc != 2) {
+    printf("\n使用方法: %s <权重文件.bin>\n", argv[0]);
+    printf("  权重文件格式: 由 Python 脚本生成的 .bin 文件\n");
+    printf("  示例: %s weight_data/weight_1024x1024_bs64.bin\n\n", argv[0]);
+    return -1;
+  }
+
+  const char *input_file = argv[1];
+
+  // 确保输出目录存在
+  ensure_directory_exists("../cuda_results");
+
+  // 初始化 LUT
+  printf("\n 初始化 NF4 LUT...\n");
+  init_nf4_lut();
+
+  // 读取权重文件
+  printf("\n 读取权重文件: %s\n", input_file);
+  int64_t rows, cols;
+  int blocksize;
+  uint8_t *h_packed, *h_absmax_q;
+  __half *h_absmax2, *h_code2;
+  float offset;
+
+  if (read_weight_file(input_file, &rows, &cols, &blocksize, &h_packed,
+                       &h_absmax_q, &h_absmax2, &h_code2, &offset) != 0) {
+    return -1;
+  }
+
+  int64_t total_elements = rows * cols;
+  int64_t num_units = (total_elements + 1) / 2; // 每个 uint8 包含两个 half
+
+  // 计算 GPU 内存大小
+  int64_t num_blocks = (total_elements + blocksize - 1) / blocksize;
+  int64_t num_groups = (num_blocks + 255) / 256;
+
+  printf("\n 计算参数:\n");
+  printf("  total_elements: %ld\n", total_elements);
+  printf("  num_units: %ld\n", num_units);
+  printf("  num_blocks: %ld\n", num_blocks);
+  printf("  num_groups: %ld\n", num_groups);
+
+  // 分配 GPU 内存
+  printf("\n 分配 GPU 内存...\n");
+  uint8_t *d_packed, *d_absmax_q;
+  __half *d_absmax2, *d_code2, *d_output;
+
+  CUDA_CHECK(cudaMalloc(&d_packed, num_units));
+  CUDA_CHECK(cudaMalloc(&d_absmax_q, num_blocks));
+  CUDA_CHECK(cudaMalloc(&d_absmax2, num_groups * sizeof(__half)));
+  CUDA_CHECK(cudaMalloc(&d_code2, 256 * sizeof(__half)));
+  CUDA_CHECK(cudaMalloc(&d_output, total_elements * sizeof(__half)));
+
+  // 拷贝数据到 GPU
+  printf(" 拷贝数据到 GPU...\n");
+  CUDA_CHECK(cudaMemcpy(d_packed, h_packed, num_units, cudaMemcpyHostToDevice));
+  CUDA_CHECK(
+      cudaMemcpy(d_absmax_q, h_absmax_q, num_blocks, cudaMemcpyHostToDevice));
+  CUDA_CHECK(cudaMemcpy(d_absmax2, h_absmax2, num_groups * sizeof(__half),
+                        cudaMemcpyHostToDevice));
+  CUDA_CHECK(cudaMemcpy(d_code2, h_code2, 256 * sizeof(__half),
+                        cudaMemcpyHostToDevice));
+  CUDA_CHECK(cudaMemcpyToSymbol(CODE2_LUT, h_code2, 256 * sizeof(__half)));
+
+  // 分配主机输出内存
+  __half *h_output = (__half *)malloc(total_elements * sizeof(__half));
+  if (!h_output) {
+    fprintf(stderr, " 主机输出内存分配失败\n");
+    return -1;
+  }
+
+  // 配置内核启动参数
+  int threads = 256;
+  int64_t total_bytes = total_elements >> 1;
+  int blocks = (total_bytes / 4 + threads - 1) / threads;
+    // int blocks = (total_bytes + threads - 1) / threads;
+    int max_scale = (threads * 2 + blocksize - 1) / blocksize;
+
+size_t smem = max_scale * sizeof(__half);
+  printf("\n 内核配置:\n");
+  printf("  blocks: %d\n", blocks);
+  printf("  threads per block: %d\n", threads);
+  printf("  总线程数: %d\n", blocks * threads);
+
+  // 预热 (5次)
+  printf("\n 预热 (5次)...\n");
+  for (int i = 0; i < 5; i++) {
+    nf4_dequant_v7<<<blocks, threads>>>(d_packed, d_absmax_q, d_absmax2, offset,
+                                        total_elements, blocksize, 256,
+                                        d_output);
+    // nf4_dequant_v5<<<blocks, threads>>>(d_packed, d_absmax_q, d_absmax2,d_code2, offset,
+    //                                     total_elements, blocksize, 256,
+    //                                     d_output);
+    // nf4_dequant_v1<<<blocks, threads>>>(d_packed, d_absmax_q, d_absmax2,d_code2, offset,
+    //                                     total_elements, blocksize, 256,
+    //                                     reinterpret_cast<__half2*>(d_output));
+
+    //  nf4_dequant_v4<<<blocks, threads, smem>>>(d_packed, d_absmax_q, d_absmax2, d_code2,
+    //                                             offset, total_elements, blocksize, 256,
+    //                                           reinterpret_cast<__half2*>(d_output));
+                                        
+  }
+  CUDA_CHECK(cudaDeviceSynchronize());
+
+  // 正式测试 (100次)
+  printf("  性能测试 (100次迭代)...\n");
+  double start_time = get_time_ms();
+
+  for (int i = 0; i < 100; i++) {
+    nf4_dequant_v7<<<blocks, threads>>>(d_packed, d_absmax_q, d_absmax2, offset,
+                                        total_elements, blocksize, 256,
+                                        d_output);
+    // nf4_dequant_v5<<<blocks, threads>>>(d_packed, d_absmax_q, d_absmax2,d_code2, offset,
+    //                                     total_elements, blocksize, 256,
+    //                                     d_output);
+    // nf4_dequant_v1<<<blocks, threads>>>(d_packed, d_absmax_q, d_absmax2,d_code2, offset,
+    //                                     total_elements, blocksize, 256,
+    //                                     reinterpret_cast<__half2*>(d_output));
+    //  nf4_dequant_v4<<<blocks, threads, smem>>>(d_packed, d_absmax_q, d_absmax2, d_code2,
+    //                                             offset, total_elements, blocksize, 256,
+    //                                           reinterpret_cast<__half2*>(d_output));
+  }
+
+  CUDA_CHECK(cudaDeviceSynchronize());
+  double end_time = get_time_ms();
+
+  double total_time = end_time - start_time;
+  double avg_time_ms = total_time / 100.0;
+
+  // 计算带宽
+  double bandwidth = calculate_bandwidth(total_elements, avg_time_ms);
+
+  // 拷贝结果回主机
+  printf(" 拷贝结果回主机...\n");
+  CUDA_CHECK(cudaMemcpy(h_output, d_output, total_elements * sizeof(__half),
+                        cudaMemcpyDeviceToHost));
+
+  // 生成输出文件名
+  char output_file[256];
+  snprintf(output_file, sizeof(output_file), "dequant_%ldx%ld_bs%d.fp16", rows,
+           cols, blocksize);
+
+  // 保存解量化结果（自动保存到cuda_results目录）
+  printf("\n 保存解量化结果...\n");
+  save_dequantized_weight(output_file, h_output, total_elements);
+
+  // 生成性能日志文件名
+  char log_file[256];
+  snprintf(log_file, sizeof(log_file), "perf_%ldx%ld_bs%d.log", rows, cols,
+           blocksize);
+
+  // 输出性能结果
+  printf("输入文件: %s\n", input_file);
+  printf("矩阵大小: %ld x %ld\n", rows, cols);
+  printf("总元素数: %ld\n", total_elements);
+  printf("数据大小: %.2f MB\n",
+         total_elements * sizeof(__half) / (1024.0 * 1024.0));
+  printf("\n");
+  printf("核函数执行时间: %.4f ms\n", avg_time_ms);
+  printf("有效内存带宽: %.2f GB/s\n", bandwidth);
+  printf("\n");
+  printf("输出文件: cuda_results/%s\n", output_file);
+  printf("日志文件: cuda_results/%s\n", log_file);
+
+
+  // 保存性能日志（也保存到cuda_results目录）
+  char log_path[512];
+  snprintf(log_path, sizeof(log_path), "../cuda_results/%s", log_file);
+
+  FILE *log_fp = fopen(log_path, "w");
+  if (log_fp) {
+    fprintf(log_fp, "input_file=%s\n", input_file);
+    fprintf(log_fp, "rows=%ld\n", rows);
+    fprintf(log_fp, "cols=%ld\n", cols);
+    fprintf(log_fp, "blocksize=%d\n", blocksize);
+    fprintf(log_fp, "total_elements=%ld\n", total_elements);
+    fprintf(log_fp, "kernel_time_ms=%.4f\n", avg_time_ms);
+    fprintf(log_fp, "bandwidth_gbps=%.2f\n", bandwidth);
+    fprintf(log_fp, "output_file=cuda_results/%s\n", output_file);
+    fclose(log_fp);
+    printf(" 性能日志已保存到: %s\n", log_path);
+  }
+
+  // 清理
+  free(h_packed);
+  free(h_absmax_q);
+  free(h_absmax2);
+  free(h_code2);
+  free(h_output);
+
+  cudaFree(d_packed);
+  cudaFree(d_absmax_q);
+  cudaFree(d_absmax2);
+  cudaFree(d_code2);
+  cudaFree(d_output);
+
+  printf("\n 测试完成!\n\n");
+  return 0;
+}
\ No newline at end of file
diff --git a/03_nf4_dequant/ayepei/nv/run_all.sh b/03_nf4_dequant/ayepei/nv/run_all.sh
new file mode 100755
index 0000000..a02a276
--- /dev/null
+++ b/03_nf4_dequant/ayepei/nv/run_all.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# run_all.sh
+
+# 确保 CUDA 程序已编译
+if [ ! -f "./nf4_dequant_cuda" ]; then
+    echo "编译 CUDA 程序..."
+    nvcc -O3 -arch=sm_90 --use_fast_math -lineinfo -Xptxas -O3 nf4_dequant_cuda.cu -o nf4_dequant_cuda
+fi
+
+# 处理所有权重文件
+for f in ../weight_data/*.bin; do
+     ./nf4_dequant_cuda "$f"
+done
\ No newline at end of file
diff --git "a/03_nf4_dequant/ayepei/\346\200\273\347\273\223\346\212\245\345\221\212.md" "b/03_nf4_dequant/ayepei/\346\200\273\347\273\223\346\212\245\345\221\212.md"
new file mode 100644
index 0000000..63e38e5
--- /dev/null
+++ "b/03_nf4_dequant/ayepei/\346\200\273\347\273\223\346\212\245\345\221\212.md"
@@ -0,0 +1,358 @@
+# NF4反量化
+# 实验平台
+英伟达：GH200
+沐曦：C500
+摩尔线程：S5000
+
+# 基准测试
+配置bitsandbytes环境进行基准测试，对于下列形状的权重矩阵生成，调用bitsandbytes进行量化及反量化，记录反量化的结果文件及运行的时间。与后续的实现进行平均误差和最大误差的对比。bitsandbytes测试在英伟达GH200进行。后续对比时各平台核函数先预热5次，再执行100次求平均时间。
+(256, 256)
+(512, 512)
+(1024, 1024)
+(2048, 2048)
+(4096, 4096)
+(8192, 8192)
+(16384, 16384)
+(3421, 3146)
+(6578, 1236)
+(7000, 7000)
+# python环境配置
+```
+conda create -n nf4 python=3.10 -y
+pip install torch --index-url https://download.pytorch.org/whl/cu124
+pip install bitsandbytes numpy pandas
+conda install -c nvidia cuda-nvcc cuda-runtime cuda-cudart -y
+cd 03_nf4_dequant/
+python generate_and_benchmark_bnb.py #生成bitsandbytes基准进行对比
+```
+# 代码文件结构
+因为权重文件存储太大，不上传到github中
+Learning-CUDA
+├── 03_nf4_dequant/ 
+│ ├── bnb_results/ # bitsandbytes 解量化输出结果
+│ ├── cuda_results/ # CUDA kernel 解量化输出结果
+│ ├── musa_results/ # MUSA 平台解量化结果
+│ ├── mx_results/ # MX 平台解量化结果
+│ ├── mexc/ # 摩尔线程平台实现代码
+│ ├── mx/ # 沐曦平台 kernel 实现
+│ ├── nv/ # CUDA kernel 实现
+│ ├── weight_data/ # NF4 量化后的权重测试数据
+│ ├── generate_and_benchmark_bnb.py # 生成 NF4 测试数据并运行 bitsandbytes benchmark
+│ ├── bnb_benchmark_results.csv # bitsandbytes benchmark 结果
+│ ├── 总结报告.md # NF4 解量化实验总结
+│ └── README.md 
+
+# nf4反量化理论
+```
+NF4[qi​]×(code2[absmax_q[block_idx]​]×absmax2​+offset)
+```
+# 英伟达平台程序执行及性能分析
+```sh
+cd nv
+./run_all.sh //对于程序进行编译和对于所有权重文件运行，生成结果文件及时间
+python compare_results.py //进行性能和误差分析
+```
+# 英伟达优化思路
+## nf4_dequant_v1:
+基础nf4反量化，实现两级缩放的解量化，建立NF4查找表(NF4_LUT)，在计算中使用float进行计算,一个线程计算一个元素，NF4_LUT放入常量内存。每个线程一次处理两个 4-bit 索引，计算两个 BF16 值后打包成一个 32-bit uint32_t 一次性写入全局内存。
+| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff |
+|-------|-------|----------|-----------|---------|-----|----------|
+| 4096x4096 | 128 | 0.0905 | 0.0666 | 1.36 | 0.00002557 | 0.00390625 |
+| 4096x4096 | 64 | 0.0891 | 0.0678 | 1.31 | 0.00001973 | 0.00390625 |
+| 3421x3146 | 64 | 0.0925 | 0.0436 | 2.12 | 0.00001746 | 0.00390625 |
+| 7000x7000 | 128 | 0.0939 | 0.1926 | 0.49 | 0.00001764 | 0.00390625 |
+| 2048x2048 | 64 | 0.0910 | 0.0183 | 4.97 | 0.00003022 | 0.00390625 |
+| 8192x8192 | 128 | 0.0912 | 0.3057 | 0.30 | 0.00002319 | 0.00390625 |
+| 6578x1236 | 64 | 0.0895 | 0.0819 | 1.09 | 0.00003105 | 0.00390625 |
+| 2048x2048 | 128 | 0.0925 | 0.0179 | 5.17 | 0.00001603 | 0.00390625 |
+| 7000x7000 | 64 | 0.0909 | 0.2325 | 0.39 | 0.00002593 | 0.00390625 |
+| 256x256 | 128 | 0.1064 | 0.0028 | 38.00 | 0.00002176 | 0.00195312 |
+| 256x256 | 64 | 0.1144 | 0.0027 | 42.37 | 0.00003350 | 0.00390625 |
+| 6578x1236 | 128 | 0.0876 | 0.0326 | 2.69 | 0.00001675 | 0.00390625 |
+| 512x512 | 64 | 0.1010 | 0.0032 | 31.56 | 0.00001991 | 0.00195312 |
+| 16384x16384 | 128 | 0.2973 | 1.6876 | 0.18 | 0.00002354 | 0.00390625 |
+| 16384x16384 | 64 | 0.3074 | 1.7599 | 0.17 | 0.00002462 | 0.00390625 |
+| 8192x8192 | 64 | 0.0918 | 0.3188 | 0.29 | 0.00001830 | 0.00390625 |
+| 1024x1024 | 64 | 0.0960 | 0.0091 | 10.55 | 0.00001878 | 0.00390625 |
+| 3421x3146 | 128 | 0.0904 | 0.0421 | 2.15 | 0.00001740 | 0.00390625 |
+| 1024x1024 | 128 | 0.0912 | 0.0091 | 10.02 | 0.00001937 | 0.00390625 |
+| 512x512 | 128 | 0.0977 | 0.0032 | 30.53 | 0.00001580 | 0.00195312 |
+## nf4_dequant_v2
+半精比float计算快，将中间的变量转化为半精执行。
+| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff |
+|-------|-------|----------|-----------|---------|-----|----------|
+| 4096x4096 | 128 | 0.0905 | 0.0670 | 1.35 | 0.00020349 | 0.00390625 |
+| 4096x4096 | 64 | 0.0891 | 0.0681 | 1.31 | 0.00018179 | 0.00390625 |
+| 3421x3146 | 64 | 0.0925 | 0.0437 | 2.12 | 0.00017142 | 0.00390625 |
+| 7000x7000 | 128 | 0.0939 | 0.1929 | 0.49 | 0.00025988 | 0.00390625 |
+| 2048x2048 | 64 | 0.0910 | 0.0185 | 4.92 | 0.00022352 | 0.00390625 |
+| 8192x8192 | 128 | 0.0912 | 0.2631 | 0.35 | 0.00022542 | 0.00390625 |
+| 6578x1236 | 64 | 0.0895 | 0.0335 | 2.67 | 0.00022066 | 0.00390625 |
+| 2048x2048 | 128 | 0.0925 | 0.0181 | 5.11 | 0.00029397 | 0.00390625 |
+| 7000x7000 | 64 | 0.0909 | 0.1984 | 0.46 | 0.00024140 | 0.00390625 |
+| 256x256 | 128 | 0.1064 | 0.0027 | 39.41 | 0.00024271 | 0.00390625 |
+| 256x256 | 64 | 0.1144 | 0.0027 | 42.37 | 0.00022900 | 0.00390625 |
+| 6578x1236 | 128 | 0.0876 | 0.0328 | 2.67 | 0.00026894 | 0.00390625 |
+| 512x512 | 64 | 0.1010 | 0.0033 | 30.61 | 0.00018466 | 0.00390625 |
+| 16384x16384 | 128 | 0.2973 | 1.0421 | 0.29 | 0.00022352 | 0.00390625 |
+| 16384x16384 | 64 | 0.3074 | 1.0718 | 0.29 | 0.00019693 | 0.00390625 |
+| 8192x8192 | 64 | 0.0918 | 0.2703 | 0.34 | 0.00017440 | 0.00390625 |
+| 1024x1024 | 64 | 0.0960 | 0.0066 | 14.55 | 0.00028801 | 0.00390625 |
+| 3421x3146 | 128 | 0.0904 | 0.0426 | 2.12 | 0.00026965 | 0.00390625 |
+| 1024x1024 | 128 | 0.0912 | 0.0065 | 14.03 | 0.00024605 | 0.00390625 |
+| 512x512 | 128 | 0.0977 | 0.0032 | 30.53 | 0.00029063 | 0.00390625 |
+## nf4_dequant_v3
+NF4_LUT（NF4量化的查找表）每次都要从float转化为__half，在核函数启动前提前转化。
+下面对8192x8192大小的反量化进行ncu分析，下面各个核函数都是对8192×8192的分析。
+计算吞吐率达到90%，而内存吞吐只有10%左右。
+![NCU性能分析报告v3.1](img/v3_th.png)
+查看warp状态，Stall MIO Throtle占主要部分
+![NCU性能分析报告v3.2](img/v3_warp.png)
+查找具体是下面两个地方对于内存的读取导致的
+![NCU性能分析报告v3.3](img/v3_source_1.png)
+![NCU性能分析报告v3.4](img/v3_source_2.png)
+| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff |
+|-------|-------|----------|-----------|---------|-----|----------|
+| 4096x4096 | 128 | 0.0905 | 0.0669 | 1.35 | 0.00020349 | 0.00390625 |
+| 4096x4096 | 64 | 0.0891 | 0.0679 | 1.31 | 0.00018179 | 0.00390625 |
+| 3421x3146 | 64 | 0.0925 | 0.0438 | 2.11 | 0.00017142 | 0.00390625 |
+| 7000x7000 | 128 | 0.0939 | 0.1932 | 0.49 | 0.00025988 | 0.00390625 |
+| 2048x2048 | 64 | 0.0910 | 0.0186 | 4.89 | 0.00022352 | 0.00390625 |
+| 8192x8192 | 128 | 0.0912 | 0.2636 | 0.35 | 0.00022542 | 0.00390625 |
+| 6578x1236 | 64 | 0.0895 | 0.0337 | 2.66 | 0.00022066 | 0.00390625 |
+| 2048x2048 | 128 | 0.0925 | 0.0180 | 5.14 | 0.00029397 | 0.00390625 |
+| 7000x7000 | 64 | 0.0909 | 0.1987 | 0.46 | 0.00024140 | 0.00390625 |
+| 256x256 | 128 | 0.1064 | 0.0027 | 39.41 | 0.00024271 | 0.00390625 |
+| 256x256 | 64 | 0.1144 | 0.0028 | 40.86 | 0.00022900 | 0.00390625 |
+| 6578x1236 | 128 | 0.0876 | 0.0326 | 2.69 | 0.00026894 | 0.00390625 |
+| 512x512 | 64 | 0.1010 | 0.0035 | 28.86 | 0.00018466 | 0.00390625 |
+| 16384x16384 | 128 | 0.2973 | 1.0441 | 0.28 | 0.00022352 | 0.00390625 |
+| 16384x16384 | 64 | 0.3074 | 1.0733 | 0.29 | 0.00019693 | 0.00390625 |
+| 8192x8192 | 64 | 0.0918 | 0.2710 | 0.34 | 0.00017440 | 0.00390625 |
+| 1024x1024 | 64 | 0.0960 | 0.0066 | 14.55 | 0.00028801 | 0.00390625 |
+| 3421x3146 | 128 | 0.0904 | 0.0428 | 2.11 | 0.00026965 | 0.00390625 |
+| 1024x1024 | 128 | 0.0912 | 0.0065 | 14.03 | 0.00024605 | 0.00390625 |
+| 512x512 | 128 | 0.0977 | 0.0034 | 28.74 | 0.00029063 | 0.00390625 |
+## nf4_dequant_v4
+利用共享内存存储scale，但当几个线程在进行存储时其他的线程在等待，时间并没有缩短。
+相比于v3，内存吞吐率更低了。
+![NCU性能分析报告v4.1](img/v4_th.png)
+查看共享内存状态大量bank conflicts
+![NCU性能分析报告v4.2](img/v4_bank.png)
+查看使用共享内存存在大量barrier
+![NCU性能分析报告v4.3](img/v4_source_1.png)
+| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff |
+|-------|-------|----------|-----------|---------|-----|----------|
+| 4096x4096 | 128 | 0.0905 | 0.0802 | 1.13 | 0.00020349 | 0.00390625 |
+| 4096x4096 | 64 | 0.0891 | 0.0821 | 1.09 | 0.00018179 | 0.00390625 |
+| 3421x3146 | 64 | 0.0925 | 0.0531 | 1.74 | 0.00017142 | 0.00390625 |
+| 7000x7000 | 128 | 0.0939 | 0.2308 | 0.41 | 0.00025988 | 0.00390625 |
+| 2048x2048 | 64 | 0.0910 | 0.0224 | 4.06 | 0.00022352 | 0.00390625 |
+| 8192x8192 | 128 | 0.0912 | 0.3140 | 0.29 | 0.00022542 | 0.00390625 |
+| 6578x1236 | 64 | 0.0895 | 0.0408 | 2.19 | 0.00022066 | 0.00390625 |
+| 2048x2048 | 128 | 0.0925 | 0.0219 | 4.22 | 0.00029397 | 0.00390625 |
+| 7000x7000 | 64 | 0.0909 | 0.2386 | 0.38 | 0.00024140 | 0.00390625 |
+| 256x256 | 128 | 0.1064 | 0.0033 | 32.24 | 0.00024271 | 0.00390625 |
+| 256x256 | 64 | 0.1144 | 0.0032 | 35.75 | 0.00022900 | 0.00390625 |
+| 6578x1236 | 128 | 0.0876 | 0.0397 | 2.21 | 0.00026894 | 0.00390625 |
+| 512x512 | 64 | 0.1010 | 0.0040 | 25.25 | 0.00018466 | 0.00390625 |
+| 16384x16384 | 128 | 0.2973 | 1.2404 | 0.24 | 0.00022352 | 0.00390625 |
+| 16384x16384 | 64 | 0.3074 | 1.2822 | 0.24 | 0.00019693 | 0.00390625 |
+| 8192x8192 | 64 | 0.0918 | 0.3244 | 0.28 | 0.00017440 | 0.00390625 |
+| 1024x1024 | 64 | 0.0960 | 0.0080 | 12.00 | 0.00028801 | 0.00390625 |
+| 3421x3146 | 128 | 0.0904 | 0.0518 | 1.75 | 0.00026965 | 0.00390625 |
+| 1024x1024 | 128 | 0.0912 | 0.0079 | 11.54 | 0.00024605 | 0.00390625 |
+| 512x512 | 128 | 0.0977 | 0.0040 | 24.42 | 0.00029063 | 0.00390625 |
+## nf4_dequant_v5
+进行向量化读取，一个程序处理多个元素
+现在的计算吞吐达到95%，但内存吞吐还是很低。
+![NCU性能分析报告v5.1](img/v5_th.png)
+还是大量存在Stall MIO Throtle
+![NCU性能分析报告v5.2](img/v5_warp.png)
+对于NF4_LUT还是有大量的Stall MIO Throtle，还有对于计算scale也有Stall MIO Throtle
+![NCU性能分析报告v5.3](img/v5_source_1.png)
+![NCU性能分析报告v5.4](img/v5_source_2.png)
+| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff |
+|-------|-------|----------|-----------|---------|-----|----------|
+| 4096x4096 | 128 | 0.0905 | 0.0847 | 1.07 | 0.00020349 | 0.00390625 |
+| 4096x4096 | 64 | 0.0891 | 0.0928 | 0.96 | 0.00018179 | 0.00390625 |
+| 3421x3146 | 64 | 0.0925 | 0.0452 | 2.05 | 0.00017142 | 0.00390625 |
+| 7000x7000 | 128 | 0.0939 | 0.3667 | 0.26 | 0.00025988 | 0.00390625 |
+| 2048x2048 | 64 | 0.0910 | 0.0197 | 4.62 | 0.00022352 | 0.00390625 |
+| 8192x8192 | 128 | 0.0912 | 0.2261 | 0.40 | 0.00022542 | 0.00390625 |
+| 6578x1236 | 64 | 0.0895 | 0.0401 | 2.23 | 0.00022066 | 0.00390625 |
+| 2048x2048 | 128 | 0.0925 | 0.0193 | 4.79 | 0.00029397 | 0.00390625 |
+| 7000x7000 | 64 | 0.0909 | 0.1721 | 0.53 | 0.00024140 | 0.00390625 |
+| 256x256 | 128 | 0.1064 | 0.0060 | 17.73 | 0.00024271 | 0.00390625 |
+| 256x256 | 64 | 0.1144 | 0.0062 | 18.45 | 0.00022900 | 0.00390625 |
+| 6578x1236 | 128 | 0.0876 | 0.0433 | 2.02 | 0.00026894 | 0.00390625 |
+| 512x512 | 64 | 0.1010 | 0.0034 | 29.71 | 0.00018466 | 0.00390625 |
+| 16384x16384 | 128 | 0.2973 | 1.4397 | 0.21 | 0.00022352 | 0.00390625 |
+| 16384x16384 | 64 | 0.3074 | 1.0827 | 0.28 | 0.00019693 | 0.00390625 |
+| 8192x8192 | 64 | 0.0918 | 0.2343 | 0.39 | 0.00017440 | 0.00390625 |
+| 1024x1024 | 64 | 0.0960 | 0.0094 | 10.21 | 0.00028801 | 0.00390625 |
+| 3421x3146 | 128 | 0.0904 | 0.0445 | 2.03 | 0.00026965 | 0.00390625 |
+| 1024x1024 | 128 | 0.0912 | 0.0058 | 15.72 | 0.00024605 | 0.00390625 |
+| 512x512 | 128 | 0.0977 | 0.0067 | 14.58 | 0.00029063 | 0.00390625 |
+## nf4_dequant_v6：
+将code也放入常量内存，减少从全局内存进行读取
+现在的计算吞吐达到96%，但内存吞吐还是很低。
+![NCU性能分析报告v6.1](img/v6_th.png)
+还是大量存在Stall MIO Throtle
+![NCU性能分析报告v6.2](img/v6_warp.png)
+对于NF4_LUT还是有大量的Stall MIO Throtle，还有对于code2的Stall MIO Throtle有所减少。
+![NCU性能分析报告v6.3](img/v6_source_1.png)
+![NCU性能分析报告v6.4](img/v6_source_2.png)
+| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff |
+|-------|-------|----------|-----------|---------|-----|----------|
+| 4096x4096 | 128 | 0.0905 | 0.0594 | 1.52 | 0.00020349 | 0.00390625 |
+| 4096x4096 | 64 | 0.0891 | 0.0621 | 1.43 | 0.00018179 | 0.00390625 |
+| 3421x3146 | 64 | 0.0925 | 0.0404 | 2.29 | 0.00017142 | 0.00390625 |
+| 7000x7000 | 128 | 0.0939 | 0.1671 | 0.56 | 0.00025988 | 0.00390625 |
+| 2048x2048 | 64 | 0.0910 | 0.0176 | 5.17 | 0.00022352 | 0.00390625 |
+| 8192x8192 | 128 | 0.0912 | 0.2273 | 0.40 | 0.00022542 | 0.00390625 |
+| 6578x1236 | 64 | 0.0895 | 0.0316 | 2.83 | 0.00022066 | 0.00390625 |
+| 2048x2048 | 128 | 0.0925 | 0.0168 | 5.51 | 0.00029397 | 0.00390625 |
+| 7000x7000 | 64 | 0.0909 | 0.1754 | 0.52 | 0.00024140 | 0.00390625 |
+| 256x256 | 128 | 0.1064 | 0.0035 | 30.40 | 0.00024271 | 0.00390625 |
+| 256x256 | 64 | 0.1144 | 0.0035 | 32.69 | 0.00022900 | 0.00390625 |
+| 6578x1236 | 128 | 0.0876 | 0.0300 | 2.92 | 0.00026894 | 0.00390625 |
+| 512x512 | 64 | 0.1010 | 0.0035 | 28.86 | 0.00018466 | 0.00390625 |
+| 16384x16384 | 128 | 0.2973 | 0.8980 | 0.33 | 0.00022352 | 0.00390625 |
+| 16384x16384 | 64 | 0.3074 | 0.9441 | 0.33 | 0.00019693 | 0.00390625 |
+| 8192x8192 | 64 | 0.0918 | 0.2389 | 0.38 | 0.00017440 | 0.00390625 |
+| 1024x1024 | 64 | 0.0960 | 0.0062 | 15.48 | 0.00028801 | 0.00390625 |
+| 3421x3146 | 128 | 0.0904 | 0.0385 | 2.35 | 0.00026965 | 0.00390625 |
+| 1024x1024 | 128 | 0.0912 | 0.0060 | 15.20 | 0.00024605 | 0.00390625 |
+| 512x512 | 128 | 0.0977 | 0.0036 | 27.14 | 0.00029063 | 0.00390625 |
+## nf4_dequant_v7：
+将NF4_LUT由常量内存放到共享内存中来，速度比v6版本大大提高了
+
+内存吞吐达到60%，且计算时间大大减少
+![NCU性能分析报告v7.1](img/v7_th.png)
+Stall MIO Throtle大大减少
+![NCU性能分析报告v7.2](img/v7_warp.png)
+引入共享内存后出现bank conflicts
+![NCU性能分析报告v7.3](img/v7_bank.png)
+对于NF4_LUT已经计算scale的Stall MIO Throtle消失
+![NCU性能分析报告v7.3](img/v7_source_1.png)
+![NCU性能分析报告v7.4](img/v7_source_2.png)
+| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff |
+|-------|-------|----------|-----------|---------|-----|----------|
+| 4096x4096 | 128 | 0.0905 | 0.0131 | 6.91 | 0.00020349 | 0.00390625 |
+| 4096x4096 | 64 | 0.0891 | 0.0134 | 6.65 | 0.00018179 | 0.00390625 |
+| 3421x3146 | 64 | 0.0925 | 0.0087 | 10.63 | 0.00017142 | 0.00390625 |
+| 7000x7000 | 128 | 0.0939 | 0.0449 | 2.09 | 0.00025988 | 0.00390625 |
+| 2048x2048 | 64 | 0.0910 | 0.0047 | 19.36 | 0.00022352 | 0.00390625 |
+| 8192x8192 | 128 | 0.0912 | 0.0581 | 1.57 | 0.00022542 | 0.00390625 |
+| 6578x1236 | 64 | 0.0895 | 0.0073 | 12.26 | 0.00022066 | 0.00390625 |
+| 2048x2048 | 128 | 0.0925 | 0.0049 | 18.88 | 0.00029397 | 0.00390625 |
+| 7000x7000 | 64 | 0.0909 | 0.0418 | 2.17 | 0.00024140 | 0.00390625 |
+| 256x256 | 128 | 0.1064 | 0.0028 | 38.00 | 0.00024271 | 0.00390625 |
+| 256x256 | 64 | 0.1144 | 0.0028 | 40.86 | 0.00022900 | 0.00390625 |
+| 6578x1236 | 128 | 0.0876 | 0.0070 | 12.51 | 0.00026894 | 0.00390625 |
+| 512x512 | 64 | 0.1010 | 0.0028 | 36.07 | 0.00018466 | 0.00390625 |
+| 16384x16384 | 128 | 0.2973 | 0.2156 | 1.38 | 0.00022352 | 0.00390625 |
+| 16384x16384 | 64 | 0.3074 | 0.2292 | 1.34 | 0.00019693 | 0.00390625 |
+| 8192x8192 | 64 | 0.0918 | 0.0593 | 1.55 | 0.00017440 | 0.00390625 |
+| 1024x1024 | 64 | 0.0960 | 0.0031 | 30.97 | 0.00028801 | 0.00390625 |
+| 3421x3146 | 128 | 0.0904 | 0.0087 | 10.39 | 0.00026965 | 0.00390625 |
+| 1024x1024 | 128 | 0.0912 | 0.0029 | 31.45 | 0.00024605 | 0.00390625 |
+| 512x512 | 128 | 0.0977 | 0.0028 | 34.89 | 0.00029063 | 0.00390625 |
+# 英伟达平台适配结果
+对于GH200对nf4_dequant_v7进行支持。执行结果权重文件为cuda_results目录下的.fp16为后缀的文件，log文件记录带宽及其执行时间。加速比在nv/comparison_cuda_results.md
+| Shape | Block | BnB (ms) | cuda (ms) | Speedup | MAE | Max Diff |
+|-------|-------|----------|-----------|---------|-----|----------|
+| 4096x4096 | 128 | 0.0905 | 0.0131 | 6.91 | 0.00020349 | 0.00390625 |
+| 4096x4096 | 64 | 0.0891 | 0.0134 | 6.65 | 0.00018179 | 0.00390625 |
+| 3421x3146 | 64 | 0.0925 | 0.0087 | 10.63 | 0.00017142 | 0.00390625 |
+| 7000x7000 | 128 | 0.0939 | 0.0449 | 2.09 | 0.00025988 | 0.00390625 |
+| 2048x2048 | 64 | 0.0910 | 0.0047 | 19.36 | 0.00022352 | 0.00390625 |
+| 8192x8192 | 128 | 0.0912 | 0.0581 | 1.57 | 0.00022542 | 0.00390625 |
+| 6578x1236 | 64 | 0.0895 | 0.0073 | 12.26 | 0.00022066 | 0.00390625 |
+| 2048x2048 | 128 | 0.0925 | 0.0049 | 18.88 | 0.00029397 | 0.00390625 |
+| 7000x7000 | 64 | 0.0909 | 0.0418 | 2.17 | 0.00024140 | 0.00390625 |
+| 256x256 | 128 | 0.1064 | 0.0028 | 38.00 | 0.00024271 | 0.00390625 |
+| 256x256 | 64 | 0.1144 | 0.0028 | 40.86 | 0.00022900 | 0.00390625 |
+| 6578x1236 | 128 | 0.0876 | 0.0070 | 12.51 | 0.00026894 | 0.00390625 |
+| 512x512 | 64 | 0.1010 | 0.0028 | 36.07 | 0.00018466 | 0.00390625 |
+| 16384x16384 | 128 | 0.2973 | 0.2156 | 1.38 | 0.00022352 | 0.00390625 |
+| 16384x16384 | 64 | 0.3074 | 0.2292 | 1.34 | 0.00019693 | 0.00390625 |
+| 8192x8192 | 64 | 0.0918 | 0.0593 | 1.55 | 0.00017440 | 0.00390625 |
+| 1024x1024 | 64 | 0.0960 | 0.0031 | 30.97 | 0.00028801 | 0.00390625 |
+| 3421x3146 | 128 | 0.0904 | 0.0087 | 10.39 | 0.00026965 | 0.00390625 |
+| 1024x1024 | 128 | 0.0912 | 0.0029 | 31.45 | 0.00024605 | 0.00390625 |
+| 512x512 | 128 | 0.0977 | 0.0028 | 34.89 | 0.00029063 | 0.00390625 |
+# 沐曦平台程序执行及性能分析
+```sh
+cd mx
+./run_all.sh 对于程序进行编译和对于所有权重文件运行
+python compare_results.py 进行性能和误差分析
+```
+# 沐曦适配结果
+对于沐曦S5000平台对nf4_dequant_v7进行支持，在大矩阵的表现较差。执行结果权重文件为mx_results目录下的.fp16为后缀的文件，log文件记录带宽及其执行时间。加速比在mx/comparison_mx_results.md
+| Shape | Block | BnB (ms) | MX (ms) | Speedup | MAE | Max Diff |
+|-------|-------|----------|---------|---------|-----|----------|
+| 1024x1024 | 128 | 0.0912 | 0.0248 | 3.68 | 0.00024605 | 0.00390625 |
+| 1024x1024 | 64 | 0.0960 | 0.0169 | 5.68 | 0.00028801 | 0.00390625 |
+| 16384x16384 | 128 | 0.2973 | 0.9282 | 0.32 | 0.00022352 | 0.00390625 |
+| 16384x16384 | 64 | 0.3074 | 1.0106 | 0.30 | 0.00019693 | 0.00390625 |
+| 2048x2048 | 128 | 0.0925 | 0.0325 | 2.85 | 0.00029397 | 0.00390625 |
+| 2048x2048 | 64 | 0.0910 | 0.0281 | 3.24 | 0.00022352 | 0.00390625 |
+| 256x256 | 128 | 0.1064 | 0.0093 | 11.44 | 0.00024271 | 0.00390625 |
+| 256x256 | 64 | 0.1144 | 0.0094 | 12.17 | 0.00022900 | 0.00390625 |
+| 3421x3146 | 128 | 0.0904 | 0.0509 | 1.78 | 0.00026965 | 0.00390625 |
+| 3421x3146 | 64 | 0.0925 | 0.0517 | 1.79 | 0.00017142 | 0.00390625 |
+| 4096x4096 | 128 | 0.0905 | 0.0813 | 1.11 | 0.00020349 | 0.00390625 |
+| 4096x4096 | 64 | 0.0891 | 0.0735 | 1.21 | 0.00018179 | 0.00390625 |
+| 512x512 | 128 | 0.0977 | 0.0154 | 6.34 | 0.00029063 | 0.00390625 |
+| 512x512 | 64 | 0.1010 | 0.0154 | 6.56 | 0.00018466 | 0.00390625 |
+| 6578x1236 | 128 | 0.0876 | 0.0402 | 2.18 | 0.00026894 | 0.00390625 |
+| 6578x1236 | 64 | 0.0895 | 0.0485 | 1.85 | 0.00022066 | 0.00390625 |
+| 7000x7000 | 128 | 0.0939 | 0.1881 | 0.50 | 0.00025988 | 0.00390625 |
+| 7000x7000 | 64 | 0.0909 | 0.1905 | 0.48 | 0.00024140 | 0.00390625 |
+| 8192x8192 | 128 | 0.0912 | 0.2358 | 0.39 | 0.00022542 | 0.00390625 |
+| 8192x8192 | 64 | 0.0918 | 0.2625 | 0.35 | 0.00017440 | 0.00390625 |
+
+# 摩尔线程程序执行及性能分析
+```sh
+cd mexc
+// 进行编译
+$MUSA_ROOT/bin/mcc -O3 -std=c++17 \
+-I$MUSA_ROOT/include \
+-L$MUSA_ROOT/lib \
+-L/usr/lib/gcc/x86_64-linux-gnu/11 \
+-lmusart -lstdc++ \
+nf4_dequant_musa.mu -o nf4_musa
+// 提交任务执行权重文件
+sbatch test.sbatch
+// 进行性能和误差分析
+python compare_results.py 
+```
+# 摩尔线程适配结果
+对于S5000对nf4_dequant_v7进行支持，在中大矩阵的表现差，且有效带宽在最大时也只有229GB/s。执行结果权重文件为mexc_results目录下的.fp16为后缀的文件，log文件记录带宽及其执行时间。加速比在mexc/comparison_musa_results.md
+| Shape | Block | BnB (ms) | MUSA (ms) | Speedup | MAE | MSE | Max Diff |
+|-------|-------|----------|-----------|---------|-----|-----|----------|
+| 2048x2048 | 128 | 0.0925 | 0.0435 | 2.13 | 0.00029397 | 0.00000030 | 0.00390625 | 
+| 512x512 | 64 | 0.1010 | 0.0080 | 12.62 | 0.00018466 | 0.00000018 | 0.00390625 | 
+| 256x256 | 64 | 0.1144 | 0.0074 | 15.46 | 0.00022900 | 0.00000024 | 0.00390625 | 
+| 3421x3146 | 64 | 0.0925 | 0.1168 | 0.79 | 0.00017142 | 0.00000012 | 0.00390625 | 
+| 7000x7000 | 64 | 0.0909 | 0.5581 | 0.16 | 0.00024140 | 0.00000024 | 0.00390625 | 
+| 7000x7000 | 128 | 0.0939 | 0.5360 | 0.18 | 0.00025988 | 0.00000024 | 0.00390625 | 
+| 2048x2048 | 64 | 0.0910 | 0.0456 | 2.00 | 0.00022352 | 0.00000018 | 0.00390625 | 
+| 6578x1236 | 128 | 0.0876 | 0.0832 | 1.05 | 0.00026894 | 0.00000024 | 0.00390625 | 
+| 1024x1024 | 128 | 0.0912 | 0.0137 | 6.66 | 0.00024605 | 0.00000024 | 0.00390625 | 
+| 256x256 | 128 | 0.1064 | 0.0077 | 13.82 | 0.00024271 | 0.00000024 | 0.00390625 | 
+| 8192x8192 | 64 | 0.0918 | 0.7726 | 0.12 | 0.00017440 | 0.00000012 | 0.00390625 | 
+| 4096x4096 | 64 | 0.0891 | 0.1832 | 0.49 | 0.00018179 | 0.00000018 | 0.00390625 | 
+| 8192x8192 | 128 | 0.0912 | 0.7425 | 0.12 | 0.00022542 | 0.00000018 | 0.00390625 | 
+| 1024x1024 | 64 | 0.0960 | 0.0145 | 6.62 | 0.00028801 | 0.00000030 | 0.00390625 | 
+| 6578x1236 | 64 | 0.0895 | 0.0867 | 1.03 | 0.00022066 | 0.00000018 | 0.00390625 | 
+| 16384x16384 | 64 | 0.3074 | 3.3083 | 0.09 | 0.00019693 | 0.00000018 | 0.00390625 | 
+| 512x512 | 128 | 0.0977 | 0.0078 | 12.53 | 0.00029063 | 0.00000030 | 0.00390625 | 
+| 4096x4096 | 128 | 0.0905 | 0.1756 | 0.52 | 0.00020349 | 0.00000018 | 0.00390625 | 
+| 3421x3146 | 128 | 0.0904 | 0.1103 | 0.82 | 0.00026965 | 0.00000024 | 0.00390625 | 
+| 16384x16384 | 128 | 0.2973 | 3.1883 | 0.09 | 0.00022352 | 0.00000018 | 0.00390625 | 
+
+# 未来可继续提升的地方
+解决v7版本存在的bank conflicts,v7版本在国产GPU上执行较差，找到原因解决。
\ No newline at end of file
diff --git "a/03_nf4_dequant/ayepei/\346\200\273\347\273\223\346\212\245\345\221\212.pdf" "b/03_nf4_dequant/ayepei/\346\200\273\347\273\223\346\212\245\345\221\212.pdf"
new file mode 100644
index 0000000..9fbec8f
Binary files /dev/null and "b/03_nf4_dequant/ayepei/\346\200\273\347\273\223\346\212\245\345\221\212.pdf" differ