文章目錄
- 前言
- 一、demo的輸入文件
- 二、trace_csv
- 三、2個值得注意的點
- 3.1 csv指令表格里面的tmask?
- 3.2 rtlsim和simx的log文件?
- 總結
前言
跟著前面那篇最后留下的幾個問題接著把輸出波形文件和csv文件的輸入、輸出搞明白!
一、demo的輸入文件
該文件夾下的內容包括:
dention@dention-virtual-machine:~/Desktop/vortex/vortex/tests/regression/demo$ tree
.
├── common.h
├── kernel.cpp
├── main.cpp
└── Makefile0 directories, 4 files
Makefile
的內容中規中矩,沒有很復雜的寫法!主要是完成了編譯的配置,其編譯的命令出現在../Makefile
內。
因此,核心文件就main.cpp
和kernel.cpp
:
kernel.cpp
如下:
#include <vx_spawn.h>
#include "common.h"void kernel_body(kernel_arg_t* __UNIFORM__ arg) {auto src0_ptr = reinterpret_cast<TYPE*>(arg->src0_addr);auto src1_ptr = reinterpret_cast<TYPE*>(arg->src1_addr);auto dst_ptr = reinterpret_cast<TYPE*>(arg->dst_addr);uint32_t count = arg->task_size;uint32_t offset = blockIdx.x * count;for (uint32_t i = 0; i < count; ++i) {dst_ptr[offset+i] = src0_ptr[offset+i] + src1_ptr[offset+i];}
}int main() {kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);return vx_spawn_threads(1, &arg->num_tasks, nullptr, (vx_kernel_func_cb)kernel_body, arg);
}
粗略看功能實現是:執行向量加法,將兩個源向量的元素相加并存儲到目標向量中。通過reinterpret_cast
將地址轉換為相應的指針類型,然后進行內存訪問和計算。這里的count
和offset
有點CUDA
的blockIdx
、thread
的感覺了。其中的vx_spawn_threads
是(見于/src/vx_spawn.c
):
int vx_spawn_threads(uint32_t dimension,const uint32_t* grid_dim,const uint32_t * block_dim,vx_kernel_func_cb kernel_func,const void* arg) {// calculate number of groups and group sizeuint32_t num_groups = 1;uint32_t group_size = 1;for (uint32_t i = 0; i < 3; ++i) {uint32_t gd = (grid_dim && (i < dimension)) ? grid_dim[i] : 1;uint32_t bd = (block_dim && (i < dimension)) ? block_dim[i] : 1;num_groups *= gd;group_size *= bd;gridDim.m[i] = gd;blockDim.m[i] = bd;}
這一段與CUDA
編程的相似度可能更加明顯,不同于NVIDIA GPU
,這段用在Vortex平臺
上啟動并行線程。根據給定的grid
和block
維度,計算線程組的數量和大小,并啟動內核函數。
然后是main.cpp
的測試程序:
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"#define FLOAT_ULP 6#define RT_CHECK(_expr) \do { \int _ret = _expr; \if (0 == _ret) \break; \printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \cleanup(); \exit(-1); \} while (false)///template <typename Type>
class Comparator {};template <>
class Comparator<int> {
public:static const char* type_str() {return "integer";}static int generate() {return rand();}static bool compare(int a, int b, int index, int errors) {if (a != b) {if (errors < 100) {printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);}return false;}return true;}
};template <>
class Comparator<float> {
private:union Float_t { float f; int i; };
public:static const char* type_str() {return "float";}static int generate() {return static_cast<float>(rand()) / RAND_MAX;}static bool compare(float a, float b, int index, int errors) {union fi_t { float f; int32_t i; };fi_t fa, fb;fa.f = a;fb.f = b;auto d = std::abs(fa.i - fb.i);if (d > FLOAT_ULP) {if (errors < 100) {printf("*** error: [%d] expected=%f(0x%x), actual=%f(0x%x), ulp=%d\n", index, b, fb.i, a, fa.i, d);}return false;}return true;}
};const char* kernel_file = "kernel.vxbin";
uint32_t count = 16;vx_device_h device = nullptr;
vx_buffer_h src0_buffer = nullptr;
vx_buffer_h src1_buffer = nullptr;
vx_buffer_h dst_buffer = nullptr;
vx_buffer_h krnl_buffer = nullptr;
vx_buffer_h args_buffer = nullptr;
kernel_arg_t kernel_arg = {};static void show_usage() {std::cout << "Vortex Test." << std::endl;std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}static void parse_args(int argc, char **argv) {int c;while ((c = getopt(argc, argv, "n:k:h?")) != -1) {switch (c) {case 'n':count = atoi(optarg);break;case 'k':kernel_file = optarg;break;case 'h':case '?': {show_usage();exit(0);} break;default:show_usage();exit(-1);}}
}void cleanup() {if (device) {vx_mem_free(src0_buffer);vx_mem_free(src1_buffer);vx_mem_free(dst_buffer);vx_mem_free(krnl_buffer);vx_mem_free(args_buffer);vx_dev_close(device);}
}int main(int argc, char *argv[]) {// parse command argumentsparse_args(argc, argv);std::srand(50);// open device connectionstd::cout << "open device connection" << std::endl;RT_CHECK(vx_dev_open(&device));uint64_t num_cores, num_warps, num_threads;RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));uint32_t total_threads = num_cores * num_warps * num_threads;uint32_t num_points = count * total_threads;uint32_t buf_size = num_points * sizeof(TYPE);std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;std::cout << "number of points: " << num_points << std::endl;std::cout << "buffer size: " << buf_size << " bytes" << std::endl;kernel_arg.num_tasks = total_threads;kernel_arg.task_size = count;// allocate device memorystd::cout << "allocate device memory" << std::endl;RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src0_buffer));RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr));RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src1_buffer));RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr));RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;// allocate host buffersstd::cout << "allocate host buffers" << std::endl;std::vector<TYPE> h_src0(num_points);std::vector<TYPE> h_src1(num_points);std::vector<TYPE> h_dst(num_points);// generate source datafor (uint32_t i = 0; i < num_points; ++i) {h_src0[i] = Comparator<TYPE>::generate();h_src1[i] = Comparator<TYPE>::generate();}// upload source buffer0std::cout << "upload source buffer0" << std::endl;RT_CHECK(vx_copy_to_dev(src0_buffer, h_src0.data(), 0, buf_size));// upload source buffer1std::cout << "upload source buffer1" << std::endl;RT_CHECK(vx_copy_to_dev(src1_buffer, h_src1.data(), 0, buf_size));// upload programstd::cout << "upload program" << std::endl;RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));// upload kernel argumentstd::cout << "upload kernel argument" << std::endl;RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));// start devicestd::cout << "start device" << std::endl;RT_CHECK(vx_start(device, krnl_buffer, args_buffer));// wait for completionstd::cout << "wait for completion" << std::endl;RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));// download destination bufferstd::cout << "download destination buffer" << std::endl;RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));// verify resultstd::cout << "verify result" << std::endl;int errors = 0;for (uint32_t i = 0; i < num_points; ++i) {auto ref = h_src0[i] + h_src1[i];auto cur = h_dst[i];if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {++errors;}}// cleanupstd::cout << "cleanup" << std::endl;cleanup();if (errors != 0) {std::cout << "Found " << std::dec << errors << " errors!" << std::endl;std::cout << "FAILED!" << std::endl;return errors;}std::cout << "PASSED!" << std::endl;return 0;
}
大致介紹其功能:
1、首先通過parse_args
函數解析命令行參數,設置每個任務的count
和內核文件路徑kernel_file
。
2、通過vx_dev_open
打開Vortex
設備連接,并使用vx_dev_caps
獲取設備的內核數量、線程數量等信息,進而根據count
和數據類型大小計算緩沖區大小。
3、在內存分配階段,程序通過vx_mem_alloc
分配設備內存用于存儲源數據和目標數據,并通過vx_mem_address
獲取設備內存地址。
4、隨后,分配主機緩沖區,并通過vx_copy_to_dev
將源數據上傳到設備內存。
5、在內核啟動階段,通過vx_upload_kernel_file
上傳內核文件,通過vx_upload_bytes
上傳內核參數,然后調用vx_start
啟動設備執行內核函數,并通過vx_ready_wait
等待內核函數執行完成。
6、在結果下載和驗證階段,通過vx_copy_from_dev
將目標數據下載到主機緩沖區,并使用Comparator<TYPE>::compare
驗證目標數據是否正確。
7、最后,釋放分配的資源并關閉設備連接。
二、trace_csv
主要分為這么幾個模塊:
parse_args:參數解析,包括了-t(--type)用于指定類型,可選包括rtlsim或者simx;包括-o(--csv),默認輸出trace.csv;包括log參數,表示輸入日志文件名。load_config:加載日志文件。parse_simx和parse_rtlsim:日志解析函數。
# parse_simx:解析simx類型的日志,并使用正則表達式匹配日志中的關鍵信息(如PC、core_id、warp_id、instr等)。
# parse_rtlsim:解析rtlsim類型的日志,并使用正則表達式匹配日志中的關鍵信息(如PC、core_id、warp_id、instr、opcode等),根據日志的階段(decode、issue、commit),逐步提取和更新指令信息。write_csv以及其他:輔助寫入csv或者數據格式處理的函數。
三、2個值得注意的點
3.1 csv指令表格里面的tmask?
csv
里面的tmask
的值要么就是1111
和1000
,但很明顯出現了J
型指令,所以不可能不出現thread mask
為其他值的情況!
dention@dention-virtual-machine:~/Desktop/vortex/vortex/build$ ./cal_line_csv.sh trace_rtlsim.csv
tmask=tmask: 1
tmask=1111: 3275
tmask=1000: 402
dention@dention-virtual-machine:~/Desktop/vortex/vortex/build$ ./cal_line_csv.sh trace_simx.csv
tmask=tmask: 1
tmask=1111: 3275
tmask=1000: 402
該shell
程序如下:
#!/bin/bashif [ -z "$1" ]; thenecho "Usage: $0 <filename>"exit 1
fifilename="$1"if [ ! -f "$filename" ]; thenecho "Error: File '$filename' not found."exit 1
fitmask_values=("0000" "0001" "0010" "0011" "0100" "0101" "0110" "0111" "1000" "1001" "1010" "1011" "1100" "1101" "1110" "1111")declare -A countsfor tmask in "${tmask_values[@]}"; docounts[$tmask]=0
doneawk -F, 'NR==1 {for(i=1;i<=NF;i++) if($i=="tmask") col=i} col {counts[$col]++} END {for(tmask in counts) print "tmask="tmask": "counts[tmask]}' "$filename"
翻了翻,發現某個run.log
里面的tmask
除了1111
和1000
,還包含其他值。
dention@dention-virtual-machine:~/Desktop/vortex/vortex/build$ ./cal_line.sh rtlsim_run.log
tmask=0000: 1
tmask=0001: 2583
tmask=0010: 572
tmask=0011: 1
tmask=0100: 574
tmask=1000: 572
tmask=1010: 2
tmask=1011: 1
tmask=1100: 2
tmask=1111: 15804
dention@dention-virtual-machine:~/Desktop/vortex/vortex/build$ ./cal_line.sh simx_run.log
tmask=1000: 402
tmask=1111: 3275
計數的shell
程序如下:
#!/bin/bashif [ -z "$1" ]; thenecho "Usage: $0 <filename>"exit 1
fifilename="$1"if [ ! -f "$filename" ]; thenecho "Error: File '$filename' not found."exit 1
fitmask_values=("0000" "0001" "0010" "0011" "0100" "0101" "0110" "0111" "1000" "1001" "1010" "1011" "1100" "1101" "1110" "1111")for tmask in "${tmask_values[@]}"; docount=$(grep -c "tmask=$tmask" "$filename")if [ $count -gt 0 ]; thenecho "tmask=$tmask: $count"fi
done
看樣子,為了看明白架構設計,tmask
部分得仔細看rtlsim_run.log
。
3.2 rtlsim和simx的log文件?
除了tmask
上的差異外,還包括若干點:
粘貼一部分simx_run.log
的開頭和結尾內容:
# 開頭
make: Entering directory '/home/dention/Desktop/vortex/vortex/build/tests/regression/demo'
LD_LIBRARY_PATH=/home/dention/Desktop/vortex/vortex/build/runtime: VORTEX_DRIVER=simx ./demo -n64
open device connection
CONFIGS: num_threads=4, num_warps=4, num_cores=1, num_clusters=1, socket_size=1, local_mem_base=0xffff0000, num_barriers=2
[VXDRV] DEV_OPEN: hdevice=0x588eaa0c6b00
[VXDRV] DCR_WRITE: hdevice=0x588eaa0c6b00, addr=0x1, value=0x80000000
[VXDRV] DCR_WRITE: hdevice=0x588eaa0c6b00, addr=0x2, value=0x0
[VXDRV] DCR_WRITE: hdevice=0x588eaa0c6b00, addr=0x3, value=0x0
[VXDRV] DCR_WRITE: hdevice=0x588eaa0c6b00, addr=0x4, value=0x0
[VXDRV] DCR_WRITE: hdevice=0x588eaa0c6b00, addr=0x5, value=0x0
[VXDRV] DEV_CAPS: hdevice=0x588eaa0c6b00, caps_id=3, value=1
[VXDRV] DEV_CAPS: hdevice=0x588eaa0c6b00, caps_id=2, value=4
[VXDRV] DEV_CAPS: hdevice=0x588eaa0c6b00, caps_id=1, value=4
data type: integer
number of points: 1024
buffer size: 4096 bytes
allocate device memory
[VXDRV] [RT:mem_alloc] size: 0x1000, asize, 0x1000,flag : 0x1
[VXDRV] MEM_ALLOC: hdevice=0x588eaa0c6b00, size=4096, flags=0x1, hbuffer=0x588eaa1d2fe0
[VXDRV] MEM_ADDRESS: hbuffer=0x588eaa1d2fe0, address=0x10000
[VXDRV] [RT:mem_alloc] size: 0x1000, asize, 0x1000,flag : 0x1
[VXDRV] MEM_ALLOC: hdevice=0x588eaa0c6b00, size=4096, flags=0x1, hbuffer=0x588eaa1d30d0
[VXDRV] MEM_ADDRESS: hbuffer=0x588eaa1d30d0, address=0x11000
[VXDRV] [RT:mem_alloc] size: 0x1000, asize, 0x1000,flag : 0x2
[VXDRV] MEM_ALLOC: hdevice=0x588eaa0c6b00, size=4096, flags=0x2, hbuffer=0x588eaa1d3180
[VXDRV] MEM_ADDRESS: hbuffer=0x588eaa1d3180, address=0x12000
dev_src0=0x10000
dev_src1=0x11000
dev_dst=0x12000
allocate host buffers
upload source buffer0
[VXDRV] COPY_TO_DEV: hbuffer=0x588eaa1d2fe0, host_addr=0x588eaa1d31a0, dst_offset=0, size=4096
upload source buffer1
[VXDRV] COPY_TO_DEV: hbuffer=0x588eaa1d30d0, host_addr=0x588eaa1d41b0, dst_offset=0, size=4096
upload program
[VXDRV] [RT:mem_reserve] addr: 0x80000000, asize:0x8000, size: 0x724c
[VXDRV] MEM_RESERVE: hdevice=0x588eaa0c6b00, address=0x80000000, size=29260, flags=0x0, hbuffer=0x588eaa1da520
[VXDRV] MEM_ACCESS: hbuffer=0x588eaa1da520, offset=0, size=29232, flags=1
[VXDRV] MEM_ACCESS: hbuffer=0x588eaa1da520, offset=29232, size=28, flags=3
[VXDRV] COPY_TO_DEV: hbuffer=0x588eaa1da520, host_addr=0x588eaa1dc3d0, dst_offset=0, size=29232
upload kernel argument
[VXDRV] [RT:mem_alloc] size: 0x20, asize, 0x1000,flag : 0x1
[VXDRV] MEM_ALLOC: hdevice=0x588eaa0c6b00, size=32, flags=0x1, hbuffer=0x588eaa1db7a0
[VXDRV] COPY_TO_DEV: hbuffer=0x588eaa1db7a0, host_addr=0x588e9665b1a0, dst_offset=0, size=32
start device
[VXDRV] START: hdevice=0x588eaa0c6b00, hkernel=0x588eaa1da520, harguments=0x588eaa1db7a0
wait for completion
[VXDRV] READY_WAIT: hdevice=0x588eaa0c6b00, timeout=86400000
DEBUG Fetch: cid=0, wid=0, tmask=1000, PC=0x80000000 (#0)
DEBUG Instr 0xfc1022f3: CSRRS x5, x0, 0xfc1
DEBUG Fetch: cid=0, wid=0, tmask=1000, PC=0x80000004 (#1)
DEBUG Instr 0x317: AUIPC x6, 0x0
DEBUG Fetch: cid=0, wid=0, tmask=1000, PC=0x80000008 (#2)
DEBUG Instr 0x15c30313: ADDI x6, x6, 0x15c
DEBUG Fetch: cid=0, wid=0, tmask=1000, PC=0x8000000c (#3)
DEBUG Instr 0x62900b: WSPAWN x5, x6
DEBUG Fetch: cid=0, wid=0, tmask=1000, PC=0x80000010 (#4)
DEBUG Instr 0xfff00293: ADDI x5, x0, 0xffffffff
DEBUG Fetch: cid=0, wid=1, tmask=1000, PC=0x80000160 (#4294967296)
DEBUG Instr 0xfff00293: ADDI x5, x0, 0xffffffff
DEBUG Fetch: cid=0, wid=2, tmask=1000, PC=0x80000160 (#8589934592)
DEBUG Instr 0xfff00293: ADDI x5, x0, 0xffffffff
DEBUG Fetch: cid=0, wid=3, tmask=1000, PC=0x80000160 (#12884901888)
DEBUG Instr 0xfff00293: ADDI x5, x0, 0xffffffff
DEBUG Fetch: cid=0, wid=0, tmask=1000, PC=0x80000014 (#5)
DEBUG Instr 0x2800b: TMC x5
DEBUG Fetch: cid=0, wid=0, tmask=1111, PC=0x80000018 (#6)
DEBUG Instr 0x118000ef: JAL x1, 0x118
DEBUG Fetch: cid=0, wid=1, tmask=1000, PC=0x80000164 (#4294967297)
DEBUG Instr 0x2800b: TMC x5# 結尾
DEBUG Fetch: cid=0, wid=0, tmask=1000, PC=0x80000124 (#1196)
DEBUG Instr 0xa2a023: SW x5, x10, 0x0
DEBUG Fetch: cid=0, wid=0, tmask=1000, PC=0x80000128 (#1197)
DEBUG Instr 0xff0000f: FENCE 0xff
DEBUG Fetch: cid=0, wid=0, tmask=1000, PC=0x8000012c (#1198)
DEBUG Instr 0xb: TMC x0
download destination buffer
[VXDRV] COPY_FROM_DEV: hbuffer=0x588eaa1d3180, host_addr=0x588eaa1d51c0, src_offset=0, size=4096
verify result
cleanup
[VXDRV] MEM_FREE: hbuffer=0x588eaa1d2fe0
[VXDRV] MEM_FREE: hbuffer=0x588eaa1d30d0
[VXDRV] MEM_FREE: hbuffer=0x588eaa1d3180
[VXDRV] MEM_FREE: hbuffer=0x588eaa1da520
[VXDRV] MEM_FREE: hbuffer=0x588eaa1db7a0
[VXDRV] DEV_CAPS: hdevice=0x588eaa0c6b00, caps_id=3, value=1
[VXDRV] DEV_CAPS: hdevice=0x588eaa0c6b00, caps_id=7, value=220126515488
[VXDRV] DEV_CAPS: hdevice=0x588eaa0c6b00, caps_id=8, value=2
[VXDRV] MPM_QUERY: hdevice=0x588eaa0c6b00, addr=0xb00, core_id=0, value=0x51e5
[VXDRV] MPM_QUERY: hdevice=0x588eaa0c6b00, addr=0xb02, core_id=0, value=0x343c
PERF: instrs=13372, cycles=20965, IPC=0.637825
[VXDRV] DEV_CLOSE: hdevice=0x588eaa0c6b00
PASSED!
make: Leaving directory '/home/dention/Desktop/vortex/vortex/build/tests/regression/demo'
再粘貼點rtlsim_run.log
的開頭內容:
# 開頭
make: Entering directory '/home/dention/Desktop/vortex/vortex/build/tests/regression/demo'
LD_LIBRARY_PATH=/home/dention/Desktop/vortex/vortex/build/runtime: VORTEX_DRIVER=rtlsim ./demo -n64
open device connection
CONFIGS: num_threads=4, num_warps=4, num_cores=1, num_clusters=1, socket_size=1, local_mem_base=0xffff0000, num_barriers=21: cluster0-socket0-core0-commit: wid=0, PC=0x5f107a52, ex=ALU, tmask=0001, wb=0, rd=3, sop=1, eop=1, data={0xb7565521, 0x9bc48e7e, 0xa32ad15d, 0x9389602e} (#13683927122444)3: cluster0-socket0-core0-fetch req: wid=3, PC=0xe433879e, tmask=0011 (#12884901888)3: cluster0-socket0-core0-fetch rsp: wid=3, PC=0x71e4448, tmask=1100, instr=0x349268a (#17094256310868)3: cluster0-socket0-core0-commit: wid=1, PC=0x52773234, ex=ALU, tmask=0100, wb=1, rd=36, sop=0, eop=1, data={0x4b5e9eb4, 0x2caf1927, 0xfe7aa429, 0xf3fe8f6c} (#12548972853897)3: cluster0-socket0-core0-issue0: wid=2, PC=0xe48211bc, ex=LSU, op=?, tmask=1111, wb=0, rd=19, rs1_data={0x9836776f, 0xc8641052, 0xd0a1def8, 0x6cc767e9}, rs2_data={0x723c9207, 0x31d09f01, 0xd7686059, 0xeb86a82f}, rs3_data={0xb75bff11, 0xd2952bed, 0x5f988735, 0x302f3caa}, offset=0x1b1 (#2627431648627)3: cluster0-socket0-core0-decode: wid=3, PC=0x71e4448, instr=0x349268a, ex=ALU, op=MULW, tmask=1100, wb=0, rd=0, rs1=0, rs2=0, rs3=0, opds=0000, use_PC=0, use_imm=0, imm=0xad7bb945 (#17094256310868)5: cluster0-socket0-core0-commit: wid=3, PC=0x1ac94c6, ex=ALU, tmask=1010, wb=0, rd=17, sop=0, eop=0, data={0x622ca31d, 0x3c0cd2a3, 0xdf57daf0, 0x4a2db2c0} (#1048219273081)5: cluster0-socket0-core0-issue0: wid=0, PC=0xe63d2e26, ex=LSU, op=SB, tmask=0000, wb=0, rd=48, rs1_data={0x192f8f69, 0x44286d54, 0x9ea4bb06, 0xd50866bc}, rs2_data={0xc96de59a, 0x84899e44, 0xf711e234, 0x6a373f12}, rs3_data={0x56c1c3b6, 0x4b37021d, 0x80b8d4f3, 0xbd771324}, offset=0x654 (#11721782109298)7: cluster0-socket0-core0-fetch req: wid=3, PC=0xe43387a2, tmask=1010 (#12884901889)7: cluster0-socket0-core0-commit: wid=2, PC=0xc93aa, ex=ALU, tmask=0100, wb=1, rd=43, sop=0, eop=1, data={0xc0967ef, 0x20438730, 0x291eaa1d, 0xa71be0fe} (#5228876286358)7: cluster0-socket0-core0-issue0: wid=1, PC=0xea70b098, ex=FPU, op=FSQRT.S, tmask=1011, wb=1, rd=56, rs1_data={0x7e4a3e0b, 0xd9f00800, 0xc6c9ee49, 0xfc9a9d2d}, rs2_data={0x0, 0x0, 0x0, 0x0}, rs3_data={0x603da36e, 0x8be4ac57, 0x1dbfe827, 0x5f107de0}, fmt=0x0, frm=0x3 (#4099303258324)
[VXDRV] DEV_OPEN: hdevice=0x57d9ef008d90
[VXDRV] DCR_WRITE: hdevice=0x57d9ef008d90, addr=0x1, value=0x80000000
STARTUP_ADDR0[VXDRV] DCR_WRITE: hdevice=0x57d9ef008d90, addr=0x2, value=0x0
STARTUP_ADDR1[VXDRV] DCR_WRITE: hdevice=0x57d9ef008d90, addr=0x3, value=0x0
STARTUP_ARG0[VXDRV] DCR_WRITE: hdevice=0x57d9ef008d90, addr=0x4, value=0x0
STARTUP_ARG1[VXDRV] DCR_WRITE: hdevice=0x57d9ef008d90, addr=0x5, value=0x0
MPM_CLASS[VXDRV] DEV_CAPS: hdevice=0x57d9ef008d90, caps_id=3, value=1
[VXDRV] DEV_CAPS: hdevice=0x57d9ef008d90, caps_id=2, value=4
[VXDRV] DEV_CAPS: hdevice=0x57d9ef008d90, caps_id=1, value=4
data type: integer
number of points: 1024
buffer size: 4096 bytes
allocate device memory
[VXDRV] MEM_ALLOC: hdevice=0x57d9ef008d90, size=4096, flags=0x1, hbuffer=0x57d9ef13e4b0
[VXDRV] MEM_ADDRESS: hbuffer=0x57d9ef13e4b0, address=0x10000
[VXDRV] MEM_ALLOC: hdevice=0x57d9ef008d90, size=4096, flags=0x1, hbuffer=0x57d9ef13e510
[VXDRV] MEM_ADDRESS: hbuffer=0x57d9ef13e510, address=0x11000
[VXDRV] MEM_ALLOC: hdevice=0x57d9ef008d90, size=4096, flags=0x2, hbuffer=0x57d9ef13e5c0
[VXDRV] MEM_ADDRESS: hbuffer=0x57d9ef13e5c0, address=0x12000
dev_src0=0x10000
dev_src1=0x11000
dev_dst=0x12000
allocate host buffers
upload source buffer0
[VXDRV] COPY_TO_DEV: hbuffer=0x57d9ef13e4b0, host_addr=0x57d9ef13e5e0, dst_offset=0, size=4096
upload source buffer1
[VXDRV] COPY_TO_DEV: hbuffer=0x57d9ef13e510, host_addr=0x57d9ef13f5f0, dst_offset=0, size=4096
upload program
[VXDRV] MEM_RESERVE: hdevice=0x57d9ef008d90, address=0x80000000, size=29260, flags=0x0, hbuffer=0x57d9ef14cb80
[VXDRV] MEM_ACCESS: hbuffer=0x57d9ef14cb80, offset=0, size=29232, flags=1
[VXDRV] MEM_ACCESS: hbuffer=0x57d9ef14cb80, offset=29232, size=28, flags=3
[VXDRV] COPY_TO_DEV: hbuffer=0x57d9ef14cb80, host_addr=0x57d9ef145870, dst_offset=0, size=29232
upload kernel argument
[VXDRV] MEM_ALLOC: hdevice=0x57d9ef008d90, size=32, flags=0x1, hbuffer=0x57d9ef143970
[VXDRV] COPY_TO_DEV: hbuffer=0x57d9ef143970, host_addr=0x57d9eb3fc1a0, dst_offset=0, size=32
start device
[VXDRV] START: hdevice=0x57d9ef008d90, hkernel=0x57d9ef14cb80, harguments=0x57d9ef143970
STARTUP_ADDR0STARTUP_ADDR1STARTUP_ARG0STARTUP_ARG1wait for completion
[VXDRV] READY_WAIT: hdevice=0x57d9ef008d90, timeout=86400000
52: [sim] run()77: cluster0-socket0-core0-fetch req: wid=0, PC=0x80000000, tmask=0001 (#0)265: cluster0-socket0-core0-fetch rsp: wid=0, PC=0x80000000, tmask=0001, instr=0xfc1022f3 (#0)265: cluster0-socket0-core0-decode: wid=0, PC=0x80000000, instr=0xfc1022f3, ex=SFU, op=CSRRS, tmask=0001, wb=1, rd=5, rs1=0, rs2=0, rs3=0, opds=1100, addr=0xfc1, use_imm=0, imm=0x5 (#0)269: cluster0-socket0-core0-fetch req: wid=0, PC=0x80000004, tmask=0001 (#1)277: cluster0-socket0-core0-fetch rsp: wid=0, PC=0x80000004, tmask=0001, instr=0x317 (#1)277: cluster0-socket0-core0-issue0: wid=0, PC=0x80000000, ex=SFU, op=CSRRS, tmask=0001, wb=1, rd=5, rs1_data={0x0, 0x0, 0x0, 0x0}, rs2_data={0x0, 0x0, 0x0, 0x0}, rs3_data={0x0, 0x0, 0x0, 0x0}, addr=0xfc1, use_imm=0, imm=0x5 (#0)277: cluster0-socket0-core0-decode: wid=0, PC=0x80000004, instr=0x317, ex=ALU, op=AUIPC, tmask=0001, wb=1, rd=6, rs1=0, rs2=0, rs3=0, opds=1000, use_PC=1, use_imm=1, imm=0x0 (#1)281: cluster0-socket0-core0-fetch req: wid=0, PC=0x80000008, tmask=0001 (#2)287: cluster0-socket0-core0-commit: wid=0, PC=0x80000000, ex=SFU, tmask=0001, wb=1, rd=5, sop=1, eop=1, data={0x4, 0x4, 0x4, 0x4} (#0)289: cluster0-socket0-core0-fetch rsp: wid=0, PC=0x80000008, tmask=0001, instr=0x15c30313 (#2)289: cluster0-socket0-core0-issue0: wid=0, PC=0x80000004, ex=ALU, op=AUIPC, tmask=0001, wb=1, rd=6, rs1_data={0x0, 0x0, 0x0, 0x0}, rs2_data={0x0, 0x0, 0x0, 0x0}, rs3_data={0x0, 0x0, 0x0, 0x0}, use_PC=1, use_imm=1, imm=0x0 (#1)289: cluster0-socket0-core0-decode: wid=0, PC=0x80000008, instr=0x15c30313, ex=ALU, op=ADDI, tmask=0001, wb=1, rd=6, rs1=6, rs2=0, rs3=0, opds=1100, use_PC=0, use_imm=1, imm=0x15c (#2)293: cluster0-socket0-core0-fetch req: wid=0, PC=0x8000000c, tmask=0001 (#3)295: cluster0-socket0-core0-commit: wid=0, PC=0x80000004, ex=ALU, tmask=0001, wb=1, rd=6, sop=1, eop=1, data={0x80000004, 0x80000004, 0x80000004, 0x80000004} (#1)301: cluster0-socket0-core0-fetch rsp: wid=0, PC=0x8000000c, tmask=0001, instr=0x62900b (#3)301: cluster0-socket0-core0-decode: wid=0, PC=0x8000000c, instr=0x62900b, ex=SFU, op=WSPAWN, tmask=0001, wb=0, rd=0, rs1=5, rs2=6, rs3=0, opds=0110 (#3)307: cluster0-socket0-core0-issue0: wid=0, PC=0x80000008, ex=ALU, op=ADDI, tmask=0001, wb=1, rd=6, rs1_data={0x0, 0x0, 0x0, 0x80000004}, rs2_data={0x0, 0x0, 0x0, 0x0}, rs3_data={0x0, 0x0, 0x0, 0x0}, use_PC=0, use_imm=1, imm=0x15c (#2)313: cluster0-socket0-core0-commit: wid=0, PC=0x80000008, ex=ALU, tmask=0001, wb=1, rd=6, sop=1, eop=1, data={0x15c, 0x15c, 0x15c, 0x80000160} (#2)325: cluster0-socket0-core0-issue0: wid=0, PC=0x8000000c, ex=SFU, op=WSPAWN, tmask=0001, wb=0, rd=0, rs1_data={0x0, 0x0, 0x0, 0x4}, rs2_data={0x0, 0x0, 0x0, 0x80000160}, rs3_data={0x0, 0x0, 0x0, 0x0} (#3)335: cluster0-socket0-core0-commit: wid=0, PC=0x8000000c, ex=SFU, tmask=0001, wb=0, rd=0, sop=1, eop=1, data={0x0, 0x0, 0x0, 0x0} (#3)337: cluster0-socket0-core0-fetch req: wid=0, PC=0x80000010, tmask=0001 (#4)# 結尾43883: cluster0-socket0-core0-fetch rsp: wid=0, PC=0x8000012c, tmask=0001, instr=0xb (#1198)43883: cluster0-socket0-core0-issue0: wid=0, PC=0x80000128, ex=LSU, op=FENCE, tmask=0001, wb=0, rd=0, rs1_data={0x0, 0x0, 0x0, 0x0}, rs2_data={0x0, 0x0, 0x0, 0x0}, rs3_data={0x0, 0x0, 0x0, 0x0}, offset=0x0 (#1197)43883: cluster0-socket0-core0-decode: wid=0, PC=0x8000012c, instr=0xb, ex=SFU, op=TMC, tmask=0001, wb=0, rd=0, rs1=0, rs2=0, rs3=0, opds=0100 (#1198)43887: cluster0-socket0-core0-commit: wid=0, PC=0x80000124, ex=LSU, tmask=0001, wb=0, rd=0, sop=1, eop=1, data={0x0, 0x0, 0x0, 0x0} (#1196)43895: cluster0-socket0-core0-issue0: wid=0, PC=0x8000012c, ex=SFU, op=TMC, tmask=0001, wb=0, rd=0, rs1_data={0x0, 0x0, 0x0, 0x0}, rs2_data={0x0, 0x0, 0x0, 0x0}, rs3_data={0x0, 0x0, 0x0, 0x0} (#1198)43905: cluster0-socket0-core0-commit: wid=0, PC=0x8000012c, ex=SFU, tmask=0001, wb=0, rd=0, sop=1, eop=1, data={0x0, 0x0, 0x0, 0x0} (#1198)44093: cluster0-socket0-core0-commit: wid=0, PC=0x80000128, ex=LSU, tmask=0001, wb=0, rd=0, sop=1, eop=1, data={0xf58212ff, 0xc1377584, 0x1a1de1da, 0xb1e383e0} (#1197)
download destination buffer
[VXDRV] COPY_FROM_DEV: hbuffer=0x57d9ef13e5c0, host_addr=0x57d9ef140600, src_offset=0, size=4096
verify result
cleanup
[VXDRV] MEM_FREE: hbuffer=0x57d9ef13e4b0
[VXDRV] MEM_FREE: hbuffer=0x57d9ef13e510
[VXDRV] MEM_FREE: hbuffer=0x57d9ef13e5c0
[VXDRV] MEM_FREE: hbuffer=0x57d9ef14cb80
[VXDRV] MEM_FREE: hbuffer=0x57d9ef143970
[VXDRV] DEV_CAPS: hdevice=0x57d9ef008d90, caps_id=3, value=1
[VXDRV] DEV_CAPS: hdevice=0x57d9ef008d90, caps_id=7, value=220126515488
[VXDRV] DEV_CAPS: hdevice=0x57d9ef008d90, caps_id=8, value=2
[VXDRV] MPM_QUERY: hdevice=0x57d9ef008d90, addr=0xb00, core_id=0, value=0x5114
[VXDRV] MPM_QUERY: hdevice=0x57d9ef008d90, addr=0xb02, core_id=0, value=0x3440
PERF: instrs=13376, cycles=20756, IPC=0.644440
[VXDRV] DEV_CLOSE: hdevice=0x57d9ef008d90
PASSED!
make: Leaving directory '/home/dention/Desktop/vortex/vortex/build/tests/regression/demo'
這倆log
文件的差異確實很大,也能看得出來基于模擬器的方案和基于rtl
的方案存在多個方面的差異!
simx_run.log:
1、日志文件從編譯和運行命令開始,記錄了程序的啟動過程。
2、包含設備配置信息(CONFIGS)。
3、詳細記錄了設備內存分配、數據上傳、程序上傳、設備啟動、指令執行等過程。
4、包含指令執行的詳細信息,如 DEBUG Fetch、DEBUG Instr 等。rtlsim_run.log:
1、日志文件同樣從編譯和運行命令開始,記錄了程序的啟動過程。同前
2、包含設備配置信息(CONFIGS)。同前
3、詳細記錄了設備內存分配、數據上傳、程序上傳、設備啟動、指令執行等過程。同前
4、包含指令執行的詳細信息,如 cluster0-socket0-core0-fetch req、cluster0-socket0-core0-fetch rsp 等。僅標志不同!
總結
對上一篇遺留的幾個任務進行了回答,順帶梳理了2種log的差異,為后續繼續解讀rtl代碼做了大量準備工作。后面就一點一點慢慢看吧,任重而道遠!