From a72a4514f3d80db9daffdd7d73053c1d0272ff13 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Sun, 18 Mar 2018 19:54:08 -0700 Subject: [PATCH] [REFACTOR] Macro standardization, lint tests (#7) * code refactoring * code refactoring * code refactoring * code refactoring * fixing macro * refactoring, tests, makefile * style - making sure lint test pass * prefixed macros with VTA, fixed bugs --- vta/Makefile | 2 +- vta/hardware/vivado/Makefile | 31 +- vta/hardware/vivado/scripts/hls.tcl | 12 +- vta/hardware/vivado/sim/vta_test.cc | 61 +- vta/hardware/vivado/src/vta.cc | 633 ++++++++--------- vta/hardware/vivado/src/vta.h | 117 ++- vta/include/vta/driver.h | 10 +- vta/include/vta/hw_spec.h | 353 ++++----- vta/make/config.mk | 62 +- vta/src/pynq/pynq_driver.cc | 34 +- vta/src/pynq/pynq_driver.h | 32 +- vta/src/runtime.cc | 195 ++--- vta/src/tvm/vta_device_api.cc | 10 +- vta/tests/hardware/common/test_lib.cc | 985 ++++++++++++++------------ vta/tests/hardware/common/test_lib.h | 16 +- vta/tests/hardware/pynq/metal_test.cc | 245 ++++--- 16 files changed, 1412 insertions(+), 1386 deletions(-) diff --git a/vta/Makefile b/vta/Makefile index 6007ed2eeaef..74c23d691c66 100644 --- a/vta/Makefile +++ b/vta/Makefile @@ -76,7 +76,7 @@ lib/libvta.$(SHARED_LIBRARY_SUFFIX): $(VTA_LIB_OBJ) lint: pylint cpplint cpplint: - python nnvm/dmlc-core/scripts/lint.py vta cpp include src + python nnvm/dmlc-core/scripts/lint.py vta cpp include src hardware tests pylint: pylint python/vta --rcfile=$(ROOTDIR)/tests/lint/pylintrc diff --git a/vta/hardware/vivado/Makefile b/vta/hardware/vivado/Makefile index dfcb06316e4d..f3d779ee2a73 100644 --- a/vta/hardware/vivado/Makefile +++ b/vta/hardware/vivado/Makefile @@ -1,6 +1,6 @@ # Directories ROOTDIR = $(CURDIR) -BUILD_DIR = $(ROOTDIR)/build +BUILD_DIR = $(ROOTDIR)/../../build/hardware/vivado SCRIPT_DIR = $(ROOTDIR)/scripts SRC_DIR = $(ROOTDIR)/src SIM_DIR = $(ROOTDIR)/sim @@ -27,20 +27,21 @@ include $(config) #-------------------- # Number of 
threads during compilation -NUM_THREADS = 8 +VTA_HW_COMP_THREADS = 8 # Target Frequency -CLOCK_FREQ = 100 +VTA_HW_COMP_CLOCK_FREQ = 100 # Timing closure compensation (0 for none, 3 for highest) -TIMING_CLOSURE_COMP = 0 +VTA_HW_COMP_TIMING_COMP = 0 # Derive clock target period -TARGET_PER = $(shell echo "$$(( (1000 + $(CLOCK_FREQ) - 1) / $(CLOCK_FREQ) - 0))" ) +TARGET_PER = \ +$(shell echo "$$(( (1000 + $(VTA_HW_COMP_CLOCK_FREQ) - 1) / $(VTA_HW_COMP_CLOCK_FREQ) - $(VTA_HW_COMP_TIMING_COMP)))" ) # Derive config name CONF = \ - $(BATCH)x$(IN_BLOCK)x$(OUT_BLOCK)_$(INP_WIDTH)bx$(WGT_WIDTH)b_$(CLOCK_FREQ)MHz_$(TARGET_PER)ns +$(VTA_BATCH)x$(VTA_IN_BLOCK)x$(VTA_OUT_BLOCK)_$(VTA_INP_WIDTH)bx$(VTA_WGT_WIDTH)b_$(VTA_HW_COMP_CLOCK_FREQ)MHz_$(TARGET_PER)ns IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF) HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF) @@ -53,23 +54,23 @@ ip: cd $(IP_BUILD_PATH) && \ $(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \ -tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \ - $(LOG_INP_WIDTH) $(LOG_WGT_WIDTH) $(LOG_ACC_WIDTH) $(LOG_OUT_WIDTH) \ - $(LOG_BATCH) $(LOG_BLOCK_OUT) $(LOG_BLOCK_IN) \ - $(LOG_UOP_BUFF_SIZE) $(LOG_INP_BUFF_SIZE) $(LOG_WGT_BUFF_SIZE) \ - $(LOG_ACC_BUFF_SIZE) $(LOG_OUT_BUFF_SIZE) + $(VTA_LOG_INP_WIDTH) $(VTA_LOG_WGT_WIDTH) $(VTA_LOG_ACC_WIDTH) $(VTA_LOG_OUT_WIDTH) \ + $(VTA_LOG_BATCH) $(VTA_LOG_BLOCK_OUT) $(VTA_LOG_BLOCK_IN) \ + $(VTA_LOG_UOP_BUFF_SIZE) $(VTA_LOG_INP_BUFF_SIZE) $(VTA_LOG_WGT_BUFF_SIZE) \ + $(VTA_LOG_ACC_BUFF_SIZE) $(VTA_LOG_OUT_BUFF_SIZE) bit: ip mkdir -p $(HW_BUILD_PATH) cd $(HW_BUILD_PATH) && \ $(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \ - -tclargs $(IP_BUILD_PATH) $(NUM_THREADS) $(CLOCK_FREQ) \ - $(INP_WIDTH) $(WGT_WIDTH) $(OUT_WIDTH) \ - $(BATCH) $(IN_BLOCK) $(OUT_BLOCK) \ - $(INP_BUFF_SIZE) $(WGT_BUFF_SIZE) $(OUT_BUFF_SIZE) + -tclargs $(IP_BUILD_PATH) $(VTA_HW_COMP_THREADS) $(VTA_HW_COMP_CLOCK_FREQ) \ + $(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(OUT_WIDTH) \ + $(VTA_BATCH) $(VTA_IN_BLOCK) 
$(VTA_OUT_BLOCK) \ + $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE) driver: bit cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog cd $(HW_BUILD_PATH)/bsp && make clean: - rm -rf build \ No newline at end of file + rm -rf $(BUILD_DIR) \ No newline at end of file diff --git a/vta/hardware/vivado/scripts/hls.tcl b/vta/hardware/vivado/scripts/hls.tcl index 220c8f3ba3bf..67ce742bf47a 100644 --- a/vta/hardware/vivado/scripts/hls.tcl +++ b/vta/hardware/vivado/scripts/hls.tcl @@ -63,12 +63,12 @@ if { [llength $argv] eq 19 } { # C define flags to pass to compiler set cflags "-I $include_dir -I $src_dir -I $test_dir \ - -DDEBUG=0 -DLOG_WGT_WIDTH=$wgt_width -DLOG_INP_WIDTH=$inp_width \ - -DLOG_ACC_WIDTH=$acc_width -DLOG_OUT_WIDTH=$out_width \ - -DLOG_BATCH=$batch -DLOG_BLOCK_OUT=$block_out -DLOG_BLOCK_IN=$block_in \ - -DLOG_UOP_BUFF_SIZE=$uop_buff_size -DLOG_INP_BUFF_SIZE=$inp_buff_size \ - -DLOG_WGT_BUFF_SIZE=$wgt_buff_size -DLOG_ACC_BUFF_SIZE=$acc_buff_size \ - -DLOG_OUT_BUFF_SIZE=$out_buff_size" + -DVTA_DEBUG=0 -DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \ + -DVTA_LOG_ACC_WIDTH=$acc_width -DVTA_LOG_OUT_WIDTH=$out_width \ + -DVTA_LOG_BATCH=$batch -DVTA_LOG_BLOCK_OUT=$block_out -DVTA_LOG_BLOCK_IN=$block_in \ + -DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \ + -DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \ + -DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size" # Initializes the HLS design and sets HLS pragmas for memory partitioning. 
# This is necessary because of a Vivado restriction that doesn't allow for diff --git a/vta/hardware/vivado/sim/vta_test.cc b/vta/hardware/vivado/sim/vta_test.cc index 2031186f31ce..16f37a866464 100644 --- a/vta/hardware/vivado/sim/vta_test.cc +++ b/vta/hardware/vivado/sim/vta_test.cc @@ -11,52 +11,49 @@ #include "../src/vta.h" #include "../../../tests/hardware/common/test_lib.h" -int main(void) -{ - -#if DEBUG==1 +int main(void) { +#if DEBUG == 1 printParameters(); #endif // Buffer indexing - assert(LOG_ACC_BUFF_DEPTH>=LOG_INP_BUFF_DEPTH); + assert(VTA_LOG_ACC_BUFF_DEPTH >= VTA_LOG_INP_BUFF_DEPTH); // Micro op bound - assert(UOP_GEM_3_1 &load_queue, - hls::stream &gemm_queue, - hls::stream &store_queue) { -#pragma HLS INTERFACE s_axilite port=insn_count bundle=CONTROL_BUS -#pragma HLS INTERFACE m_axi port=insns offset=slave bundle=ins_port -#pragma HLS INTERFACE axis port=load_queue -#pragma HLS INTERFACE axis port=gemm_queue -#pragma HLS INTERFACE axis port=store_queue -#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS - - INSN_DECODE: for (int pc = 0; pc < insn_count; pc ++) { -#pragma HLS PIPELINE II=1 + hls::stream *load_queue, + hls::stream *gemm_queue, + hls::stream *store_queue) { +#pragma HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS +#pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port +#pragma HLS INTERFACE axis port = load_queue +#pragma HLS INTERFACE axis port = gemm_queue +#pragma HLS INTERFACE axis port = store_queue +#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS + + INSN_DECODE: for (int pc = 0; pc < insn_count; pc++) { +#pragma HLS PIPELINE II = 1 // Read instruction fields insn_T insn = insns[pc]; // Do some partial decoding - opcode_T opcode = insn.range(INSN_MEM_0_1, INSN_MEM_0_0); - memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0); + opcode_T opcode = insn.range(VTA_INSN_MEM_0_1, VTA_INSN_MEM_0_0); + memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, 
VTA_INSN_MEM_5_0); // Push to appropriate instruction queue - if (opcode == OPCODE_STORE) { - store_queue.write(insn); - } else if (opcode == OPCODE_LOAD && - (memory_type == MEM_ID_INP || memory_type == MEM_ID_WGT)) { - load_queue.write(insn); + if (opcode == VTA_OPCODE_STORE) { + store_queue->write(insn); + } else if (opcode == VTA_OPCODE_LOAD && + (memory_type == VTA_MEM_ID_INP || memory_type == VTA_MEM_ID_WGT)) { + load_queue->write(insn); } else { - gemm_queue.write(insn); + gemm_queue->write(insn); } } - } -void load ( +void load( volatile inp_vec_T *inputs, volatile wgt_vec_T *weights, - hls::stream &load_queue, - hls::stream &g2l_dep_queue, - hls::stream &l2g_dep_queue, - inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH], - wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT] + hls::stream *load_queue, + hls::stream *g2l_dep_queue, + hls::stream *l2g_dep_queue, + inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH], + wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT] ) { -#pragma HLS INTERFACE m_axi port=weights offset=slave bundle=data_port -#pragma HLS INTERFACE m_axi port=inputs offset=slave bundle=data_port -#pragma HLS INTERFACE axis port=load_queue -#pragma HLS INTERFACE axis port=g2l_dep_queue -#pragma HLS INTERFACE axis port=l2g_dep_queue -#pragma HLS INTERFACE bram port=wgt_mem -#pragma HLS INTERFACE bram port=inp_mem -#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS -// #pragma HLS ARRAY_PARTITION variable=inp_mem complete dim=2 +#pragma HLS INTERFACE m_axi port = weights offset = slave bundle = data_port +#pragma HLS INTERFACE m_axi port = inputs offset = slave bundle = data_port +#pragma HLS INTERFACE axis port = load_queue +#pragma HLS INTERFACE axis port = g2l_dep_queue +#pragma HLS INTERFACE axis port = l2g_dep_queue +#pragma HLS INTERFACE bram port = wgt_mem +#pragma HLS INTERFACE bram port = inp_mem +#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS // Pop load instruction - insn_T insn = load_queue.read(); + insn_T insn = 
load_queue->read(); // Decode instruction - bool pop_prev_dependence = insn[INSN_MEM_1]; - bool pop_next_dependence = insn[INSN_MEM_2]; - bool push_prev_dependence = insn[INSN_MEM_3]; - bool push_next_dependence = insn[INSN_MEM_4]; - memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0); - memop_sram_T sram_base = insn.range(INSN_MEM_6_1, INSN_MEM_6_0); - memop_dram_T dram_base = insn.range(INSN_MEM_7_1, INSN_MEM_7_0); - memop_size_T y_size = insn.range(INSN_MEM_8_1, INSN_MEM_8_0); - memop_size_T x_size = insn.range(INSN_MEM_9_1, INSN_MEM_9_0); - memop_stride_T x_stride = insn.range(INSN_MEM_A_1, INSN_MEM_A_0); - memop_pad_T y_pad_0 = insn.range(INSN_MEM_B_1, INSN_MEM_B_0); - memop_pad_T y_pad_1 = insn.range(INSN_MEM_C_1, INSN_MEM_C_0); - memop_pad_T x_pad_0 = insn.range(INSN_MEM_D_1, INSN_MEM_D_0); - memop_pad_T x_pad_1 = insn.range(INSN_MEM_E_1, INSN_MEM_E_0); + bool pop_prev_dependence = insn[VTA_INSN_MEM_1]; + bool pop_next_dependence = insn[VTA_INSN_MEM_2]; + bool push_prev_dependence = insn[VTA_INSN_MEM_3]; + bool push_next_dependence = insn[VTA_INSN_MEM_4]; + memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0); + memop_sram_T sram_base = insn.range(VTA_INSN_MEM_6_1, VTA_INSN_MEM_6_0); + memop_dram_T dram_base = insn.range(VTA_INSN_MEM_7_1, VTA_INSN_MEM_7_0); + memop_size_T y_size = insn.range(VTA_INSN_MEM_8_1, VTA_INSN_MEM_8_0); + memop_size_T x_size = insn.range(VTA_INSN_MEM_9_1, VTA_INSN_MEM_9_0); + memop_stride_T x_stride = insn.range(VTA_INSN_MEM_A_1, VTA_INSN_MEM_A_0); + memop_pad_T y_pad_0 = insn.range(VTA_INSN_MEM_B_1, VTA_INSN_MEM_B_0); + memop_pad_T y_pad_1 = insn.range(VTA_INSN_MEM_C_1, VTA_INSN_MEM_C_0); + memop_pad_T x_pad_0 = insn.range(VTA_INSN_MEM_D_1, VTA_INSN_MEM_D_0); + memop_pad_T x_pad_1 = insn.range(VTA_INSN_MEM_E_1, VTA_INSN_MEM_E_0); // Pop dependence token if instructed if (pop_next_dependence) { - g2l_dep_queue.read(); + g2l_dep_queue->read(); } // Initialize indices @@ -94,29 +92,26 @@ void load ( 
memop_size_T y_size_total = y_pad_0 + y_size + y_pad_1; memop_size_T x_size_total = x_pad_0 + x_size + x_pad_1; memop_sram_T y_offset = x_size_total * y_pad_0; -#pragma HLS RESOURCE variable=y_offset core=Mul_LUT +// Force this computation to be done with LUTs to avoid using too many DSPs +#pragma HLS RESOURCE variable = y_offset core = Mul_LUT // Skip padding along y dimension sram_idx += y_offset; // Perform data transfer from DRAM - for (int y = 0; y < y_size; y ++) { + for (int y = 0; y < y_size; y++) { #pragma HLS PIPELINE rewind // Skip padding along x dimension sram_idx += x_pad_0; // Perform data transfer - if (memory_type == MEM_ID_INP) { - memcpy( - &inp_mem[sram_idx][0], - (const inp_vec_T*) &inputs[dram_idx * BATCH], - x_size * INP_ELEM_BYTES - ); + if (memory_type == VTA_MEM_ID_INP) { + memcpy(&inp_mem[sram_idx][0], + (const inp_vec_T*) &inputs[dram_idx * VTA_BATCH], + x_size * VTA_INP_ELEM_BYTES); } else { - memcpy( - &wgt_mem[sram_idx][0], - (const wgt_vec_T*) &weights[dram_idx * BLOCK_OUT], - x_size * WGT_ELEM_BYTES - ); + memcpy(&wgt_mem[sram_idx][0], + (const wgt_vec_T*) &weights[dram_idx * VTA_BLOCK_OUT], + x_size * VTA_WGT_ELEM_BYTES); } sram_idx += x_size; dram_idx += x_stride; @@ -127,136 +122,130 @@ void load ( // Reset SRAM index sram_idx = sram_base; // Pad x/y edges with zeros - for (int y = 0; y < y_size_total; y ++) { + for (int y = 0; y < y_size_total; y++) { if (y < y_pad_0 || y >= y_pad_0 + y_size) { - for (int x = 0; x < x_size_total; x ++) { -#pragma HLS PIPELINE II=1 rewind - if (memory_type == MEM_ID_INP) { - for (int i = 0; i < BATCH; i ++) { + for (int x = 0; x < x_size_total; x++) { +#pragma HLS PIPELINE II = 1 rewind + if (memory_type == VTA_MEM_ID_INP) { + for (int i = 0; i < VTA_BATCH; i++) { inp_mem[sram_idx][i] = 0; } } else { - for (int i = 0; i < BLOCK_OUT; i ++) { + for (int i = 0; i < VTA_BLOCK_OUT; i++) { wgt_mem[sram_idx][i] = 0; } } - sram_idx ++; + sram_idx++; } } else { - for (int x = 0; x < x_pad_0; x ++) { 
-#pragma HLS PIPELINE II=1 rewind - if (memory_type == MEM_ID_INP) { - for (int i = 0; i < BATCH; i ++) { + for (int x = 0; x < x_pad_0; x++) { +#pragma HLS PIPELINE II = 1 rewind + if (memory_type == VTA_MEM_ID_INP) { + for (int i = 0; i < VTA_BATCH; i++) { inp_mem[sram_idx][i] = 0; } } else { - for (int i = 0; i < BLOCK_OUT; i ++) { + for (int i = 0; i < VTA_BLOCK_OUT; i++) { wgt_mem[sram_idx][i] = 0; } } - sram_idx ++; + sram_idx++; } sram_idx += x_size; - for (int x = 0; x < x_pad_1; x ++) { -#pragma HLS PIPELINE II=1 rewind - if (memory_type == MEM_ID_INP) { - for (int i = 0; i < BATCH; i ++) { + for (int x = 0; x < x_pad_1; x++) { +#pragma HLS PIPELINE II = 1 rewind + if (memory_type == VTA_MEM_ID_INP) { + for (int i = 0; i < VTA_BATCH; i++) { inp_mem[sram_idx][i] = 0; } } else { - for (int i = 0; i < BLOCK_OUT; i ++) { + for (int i = 0; i < VTA_BLOCK_OUT; i++) { wgt_mem[sram_idx][i] = 0; } } - sram_idx ++; + sram_idx++; } - } } // Push dependence token if instructed if (push_next_dependence) { - l2g_dep_queue.write(1); + l2g_dep_queue->write(1); } } -void compute ( - volatile uint32_t &done, +void compute( + volatile uint32_t *done, volatile uop_T *uops, volatile acc_vec_T *biases, - hls::stream &gemm_queue, - hls::stream &l2g_dep_queue, - hls::stream &s2g_dep_queue, - hls::stream &g2l_dep_queue, - hls::stream &g2s_dep_queue, - out_vec_T inp_mem[INP_BUFF_DEPTH][BATCH], - wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT], - out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH] + hls::stream *gemm_queue, + hls::stream *l2g_dep_queue, + hls::stream *s2g_dep_queue, + hls::stream *g2l_dep_queue, + hls::stream *g2s_dep_queue, + out_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH], + wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT], + out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH] ) { -#pragma HLS INTERFACE s_axilite port=done bundle=CONTROL_BUS -#pragma HLS INTERFACE m_axi port=uops offset=slave bundle=uop_port -#pragma HLS INTERFACE m_axi port=biases offset=slave 
bundle=data_port -#pragma HLS INTERFACE axis port=gemm_queue -#pragma HLS INTERFACE axis port=l2g_dep_queue -#pragma HLS INTERFACE axis port=s2g_dep_queue -#pragma HLS INTERFACE axis port=g2l_dep_queue -#pragma HLS INTERFACE axis port=g2s_dep_queue -#pragma HLS INTERFACE bram port=inp_mem -#pragma HLS INTERFACE bram port=wgt_mem -#pragma HLS INTERFACE bram port=out_mem -#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS -// #pragma HLS ARRAY_PARTITION variable=inp_mem complete dim=2 -// #pragma HLS ARRAY_PARTITION variable=out_mem complete dim=2 +#pragma HLS INTERFACE s_axilite port = done bundle = CONTROL_BUS +#pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port +#pragma HLS INTERFACE m_axi port = biases offset = slave bundle = data_port +#pragma HLS INTERFACE axis port = gemm_queue +#pragma HLS INTERFACE axis port = l2g_dep_queue +#pragma HLS INTERFACE axis port = s2g_dep_queue +#pragma HLS INTERFACE axis port = g2l_dep_queue +#pragma HLS INTERFACE axis port = g2s_dep_queue +#pragma HLS INTERFACE bram port = inp_mem +#pragma HLS INTERFACE bram port = wgt_mem +#pragma HLS INTERFACE bram port = out_mem +#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS // This is necessary connect the SRAM to the load module -#pragma HLS RESOURCE variable=wgt_mem core=RAM_1P +#pragma HLS RESOURCE variable = wgt_mem core = RAM_1P // Micro-op storage - static uop_T uop_mem[UOP_BUFF_DEPTH]; + static uop_T uop_mem[VTA_UOP_BUFF_DEPTH]; // Accumulator storage - static acc_vec_T acc_mem[ACC_BUFF_DEPTH][BATCH]; -#pragma HLS ARRAY_PARTITION variable=acc_mem complete dim=2 + static acc_vec_T acc_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]; +#pragma HLS ARRAY_PARTITION variable = acc_mem complete dim = 2 // Pop GEMM instruction - insn_T insn = gemm_queue.read(); + insn_T insn = gemm_queue->read(); // Decode - opcode_T opcode = insn.range(INSN_MEM_0_1, INSN_MEM_0_0); - bool pop_prev_dependence = insn[INSN_MEM_1]; - bool pop_next_dependence = 
insn[INSN_MEM_2]; - bool push_prev_dependence = insn[INSN_MEM_3]; - bool push_next_dependence = insn[INSN_MEM_4]; + opcode_T opcode = insn.range(VTA_INSN_MEM_0_1, VTA_INSN_MEM_0_0); + bool pop_prev_dependence = insn[VTA_INSN_MEM_1]; + bool pop_next_dependence = insn[VTA_INSN_MEM_2]; + bool push_prev_dependence = insn[VTA_INSN_MEM_3]; + bool push_next_dependence = insn[VTA_INSN_MEM_4]; // Pop dependence token if instructed if (pop_prev_dependence) { - l2g_dep_queue.read(); + l2g_dep_queue->read(); } if (pop_next_dependence) { - s2g_dep_queue.read(); + s2g_dep_queue->read(); } // Perform action based on opcode - if (opcode == OPCODE_FINISH) { - + if (opcode == VTA_OPCODE_FINISH) { // Set done flag if we reach a FINISH instruction - done = 1; - - } else if (opcode == OPCODE_LOAD || opcode == OPCODE_STORE) { - + *done = 1; + } else if (opcode == VTA_OPCODE_LOAD || opcode == VTA_OPCODE_STORE) { // Set done value - done = 0; + *done = 0; // Decode instruction - memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0); - memop_sram_T sram_base = insn.range(INSN_MEM_6_1, INSN_MEM_6_0); - memop_dram_T dram_base = insn.range(INSN_MEM_7_1, INSN_MEM_7_0); - memop_size_T y_size = insn.range(INSN_MEM_8_1, INSN_MEM_8_0); - memop_size_T x_size = insn.range(INSN_MEM_9_1, INSN_MEM_9_0); - memop_stride_T x_stride = insn.range(INSN_MEM_A_1, INSN_MEM_A_0); - memop_pad_T y_pad_0 = insn.range(INSN_MEM_B_1, INSN_MEM_B_0); - memop_pad_T y_pad_1 = insn.range(INSN_MEM_C_1, INSN_MEM_C_0); - memop_pad_T x_pad_0 = insn.range(INSN_MEM_D_1, INSN_MEM_D_0); - memop_pad_T x_pad_1 = insn.range(INSN_MEM_E_1, INSN_MEM_E_0); + memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0); + memop_sram_T sram_base = insn.range(VTA_INSN_MEM_6_1, VTA_INSN_MEM_6_0); + memop_dram_T dram_base = insn.range(VTA_INSN_MEM_7_1, VTA_INSN_MEM_7_0); + memop_size_T y_size = insn.range(VTA_INSN_MEM_8_1, VTA_INSN_MEM_8_0); + memop_size_T x_size = insn.range(VTA_INSN_MEM_9_1, VTA_INSN_MEM_9_0); + 
memop_stride_T x_stride = insn.range(VTA_INSN_MEM_A_1, VTA_INSN_MEM_A_0); + memop_pad_T y_pad_0 = insn.range(VTA_INSN_MEM_B_1, VTA_INSN_MEM_B_0); + memop_pad_T y_pad_1 = insn.range(VTA_INSN_MEM_C_1, VTA_INSN_MEM_C_0); + memop_pad_T x_pad_0 = insn.range(VTA_INSN_MEM_D_1, VTA_INSN_MEM_D_0); + memop_pad_T x_pad_1 = insn.range(VTA_INSN_MEM_E_1, VTA_INSN_MEM_E_0); // Initialize indices memop_sram_T sram_idx = sram_base; @@ -266,220 +255,202 @@ void compute ( memop_size_T y_size_total = y_pad_0 + y_size + y_pad_1; memop_size_T x_size_total = x_pad_0 + x_size + x_pad_1; memop_sram_T y_offset = x_size_total * y_pad_0; -#pragma HLS RESOURCE variable=y_offset core=Mul_LUT +// Force this computation to be done with LUTs to avoid using too many DSPs +#pragma HLS RESOURCE variable = y_offset core = Mul_LUT - if (memory_type == MEM_ID_UOP) { + if (memory_type == VTA_MEM_ID_UOP) { // Perform data transfer - memcpy( - &uop_mem[sram_base], - (const uop_T*) &uops[dram_base], - x_size * sizeof(uop_T) - ); + memcpy(&uop_mem[sram_base], + (const uop_T*) &uops[dram_base], + x_size * sizeof(uop_T)); } else { // Skip vertical padding sram_idx += y_offset; // Perform data transfer from DRAM - for (int y = 0; y < y_size; y ++) { + for (int y = 0; y < y_size; y++) { #pragma HLS PIPELINE rewind // Skip padding along x dimension sram_idx += x_pad_0; // Perform data transfer - memcpy( - &acc_mem[sram_idx][0], - (const acc_vec_T*) &biases[dram_idx * BATCH], - x_size*ACC_ELEM_BYTES - ); + memcpy(&acc_mem[sram_idx][0], + (const acc_vec_T*) &biases[dram_idx * VTA_BATCH], + x_size*VTA_ACC_ELEM_BYTES); sram_idx += x_size; dram_idx += x_stride; // Skip padding along x dimension sram_idx += x_pad_1; } } - - } else if (opcode == OPCODE_GEMM || opcode == OPCODE_ALU) { - + } else if (opcode == VTA_OPCODE_GEMM || opcode == VTA_OPCODE_ALU) { // Set done value - done = 0; + *done = 0; // Decode - uop_idx_T uop_bgn = insn.range(INSN_GEM_5_1, INSN_GEM_5_0); - uop_idx_T uop_end = insn.range(INSN_GEM_6_1, 
INSN_GEM_6_0); - loop_T iter_out = insn.range(INSN_GEM_7_1, INSN_GEM_7_0); - loop_T iter_in = insn.range(INSN_GEM_8_1, INSN_GEM_8_0); - acc_idx_T dst_factor_out = insn.range(INSN_GEM_9_1, INSN_GEM_9_0); - acc_idx_T dst_factor_in = insn.range(INSN_GEM_A_1, INSN_GEM_A_0); - inp_idx_T src_factor_out = insn.range(INSN_GEM_B_1, INSN_GEM_B_0); - inp_idx_T src_factor_in = insn.range(INSN_GEM_C_1, INSN_GEM_C_0); + uop_idx_T uop_bgn = insn.range(VTA_INSN_GEM_5_1, VTA_INSN_GEM_5_0); + uop_idx_T uop_end = insn.range(VTA_INSN_GEM_6_1, VTA_INSN_GEM_6_0); + loop_T iter_out = insn.range(VTA_INSN_GEM_7_1, VTA_INSN_GEM_7_0); + loop_T iter_in = insn.range(VTA_INSN_GEM_8_1, VTA_INSN_GEM_8_0); + acc_idx_T dst_factor_out = insn.range(VTA_INSN_GEM_9_1, VTA_INSN_GEM_9_0); + acc_idx_T dst_factor_in = insn.range(VTA_INSN_GEM_A_1, VTA_INSN_GEM_A_0); + inp_idx_T src_factor_out = insn.range(VTA_INSN_GEM_B_1, VTA_INSN_GEM_B_0); + inp_idx_T src_factor_in = insn.range(VTA_INSN_GEM_C_1, VTA_INSN_GEM_C_0); // GEMM-specific fields - wgt_idx_T wgt_factor_out = insn.range(INSN_GEM_D_1, INSN_GEM_D_0); - wgt_idx_T wgt_factor_in = insn.range(INSN_GEM_E_1, INSN_GEM_E_0); + wgt_idx_T wgt_factor_out = insn.range(VTA_INSN_GEM_D_1, VTA_INSN_GEM_D_0); + wgt_idx_T wgt_factor_in = insn.range(VTA_INSN_GEM_E_1, VTA_INSN_GEM_E_0); // ALU-specific field - aluop_opcode_T alu_opcode = insn.range(INSN_ALU_D_1, INSN_ALU_D_0); - bool use_imm = insn[INSN_ALU_E]; - aluop_imm_T imm = insn.range(INSN_ALU_F_1, INSN_ALU_F_0); - + aluop_opcode_T alu_opcode = insn.range(VTA_INSN_ALU_D_1, VTA_INSN_ALU_D_0); + bool use_imm = insn[VTA_INSN_ALU_E]; + aluop_imm_T imm = insn.range(VTA_INSN_ALU_F_1, VTA_INSN_ALU_F_0); acc_idx_T dst_offset_out = 0; inp_idx_T src_offset_out = 0; wgt_idx_T wgt_offset_out = 0; // Outer Loop - EXE_OUT_LOOP: for (int it_out = 0; it_out < iter_out; it_out ++) { -#pragma HLS DEPENDENCE variable=acc_mem inter false - + EXE_OUT_LOOP: for (int it_out = 0; it_out < iter_out; it_out++) { +#pragma HLS DEPENDENCE 
variable = acc_mem inter false acc_idx_T dst_offset_in = dst_offset_out; inp_idx_T src_offset_in = src_offset_out; wgt_idx_T wgt_offset_in = wgt_offset_out; // Inner Loop - EXE_IN_LOOP: for (int it_in = 0; it_in < iter_in; it_in ++) { - + EXE_IN_LOOP: for (int it_in = 0; it_in < iter_in; it_in++) { // Perform appropriate computation based on opcode - if (opcode == OPCODE_GEMM) { - + if (opcode == VTA_OPCODE_GEMM) { // Iterate over micro op - READ_GEMM_UOP: for (int upc = uop_bgn; upc < uop_end; upc ++) { -#pragma HLS PIPELINE II=1 rewind + READ_GEMM_UOP: for (int upc = uop_bgn; upc < uop_end; upc++) { +#pragma HLS PIPELINE II = 1 rewind // Read micro-op fields uop_T uop = uop_mem[upc]; // Decode indices - bool reset_out = uop[UOP_GEM_0]; + bool reset_out = uop[VTA_UOP_GEM_0]; acc_idx_T dst_idx = - uop.range(UOP_GEM_1_1, UOP_GEM_1_0) + dst_offset_in; + uop.range(VTA_UOP_GEM_1_1, VTA_UOP_GEM_1_0) + dst_offset_in; acc_idx_T src_idx = - uop.range(UOP_GEM_2_1, UOP_GEM_2_0) + src_offset_in; + uop.range(VTA_UOP_GEM_2_1, VTA_UOP_GEM_2_0) + src_offset_in; wgt_idx_T wgt_idx = - uop.range(UOP_GEM_3_1, UOP_GEM_3_0) + wgt_offset_in; + uop.range(VTA_UOP_GEM_3_1, VTA_UOP_GEM_3_0) + wgt_offset_in; // Read weight matrix - wgt_vec_T w_matrix[BLOCK_OUT]; - for (int i = 0; i < BLOCK_OUT; i ++) { + wgt_vec_T w_matrix[VTA_BLOCK_OUT]; + for (int i = 0; i < VTA_BLOCK_OUT; i++) { w_matrix[i] = wgt_mem[wgt_idx][i]; } // Read input matrix and accum matrix - acc_vec_T o_matrix[BATCH]; - out_vec_T i_matrix[BATCH]; - for (int i = 0; i < BATCH; i ++) { + acc_vec_T o_matrix[VTA_BATCH]; + out_vec_T i_matrix[VTA_BATCH]; + for (int i = 0; i < VTA_BATCH; i++) { o_matrix[i] = acc_mem[dst_idx][i]; i_matrix[i] = inp_mem[src_idx][i]; } // Result matrices - acc_vec_T acc_mem_val[BATCH]; - out_vec_T st_buf_val[BATCH]; + acc_vec_T acc_mem_val[VTA_BATCH]; + out_vec_T st_buf_val[VTA_BATCH]; // Inner GEMM loop - for (int i = 0; i < BATCH; i ++) { - for (int b = 0; b < BLOCK_OUT; b ++) { + for (int i = 0; i < 
VTA_BATCH; i++) { + for (int b = 0; b < VTA_BLOCK_OUT; b++) { // Initialize the accumulator values acc_T accum = - o_matrix[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH); + o_matrix[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH); // Dot product sum sum_T tmp = 0; // Inner matrix multiplication loop (input channel/feature) - for (int k=0; k> (aluop_sh_imm_T) src_1.range(LOG_ACC_WIDTH - 1, 0); - shr_res[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH) = - shr_val; - short_shr_res[i].range((b + 1) * INP_WIDTH - 1, b * INP_WIDTH) = - (inp_T) shr_val.range(INP_WIDTH-1, 0); + src_0 >> (aluop_sh_imm_T) src_1.range(VTA_LOG_ACC_WIDTH - 1, 0); + shr_res[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) = shr_val; + short_shr_res[i].range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) = + (inp_T) shr_val.range(VTA_OUT_WIDTH-1, 0); } // Store to accum memory/store buffer - if (alu_opcode == ALU_OPCODE_MIN || - alu_opcode == ALU_OPCODE_MAX) { + if (alu_opcode == VTA_ALU_OPCODE_MIN || + alu_opcode == VTA_ALU_OPCODE_MAX) { acc_mem[dst_idx][i] = cmp_res[i]; out_mem[dst_idx][i] = short_cmp_res[i]; - } else if (alu_opcode==ALU_OPCODE_ADD) { + } else if (alu_opcode == VTA_ALU_OPCODE_ADD) { acc_mem[dst_idx][i] = add_res[i]; out_mem[dst_idx][i] = short_add_res[i]; - } else if (alu_opcode==ALU_OPCODE_SHR) { + } else if (alu_opcode == VTA_ALU_OPCODE_SHR) { acc_mem[dst_idx][i] = shr_res[i]; out_mem[dst_idx][i] = short_shr_res[i]; } @@ -502,51 +473,49 @@ void compute ( // Push dependence token if instructed if (push_prev_dependence) { - g2l_dep_queue.write(1); + g2l_dep_queue->write(1); } if (push_next_dependence) { - g2s_dep_queue.write(1); + g2s_dep_queue->write(1); } - } -void store ( +void store( volatile out_vec_T *outputs, - hls::stream &store_queue, - hls::stream &g2s_dep_queue, - hls::stream &s2g_dep_queue, - out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH] + hls::stream *store_queue, + hls::stream *g2s_dep_queue, + hls::stream *s2g_dep_queue, + out_vec_T 
out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH] ) { -#pragma HLS INTERFACE m_axi port=outputs offset=slave bundle=data_port -#pragma HLS INTERFACE axis port=store_queue -#pragma HLS INTERFACE axis port=g2s_dep_queue -#pragma HLS INTERFACE axis port=s2g_dep_queue -#pragma HLS INTERFACE bram port=out_mem -#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS -// #pragma HLS ARRAY_PARTITION variable=out_mem complete dim=2 +#pragma HLS INTERFACE m_axi port = outputs offset = slave bundle = data_port +#pragma HLS INTERFACE axis port = store_queue +#pragma HLS INTERFACE axis port = g2s_dep_queue +#pragma HLS INTERFACE axis port = s2g_dep_queue +#pragma HLS INTERFACE bram port = out_mem +#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS // Load buffer - insn_T insn = store_queue.read(); + insn_T insn = store_queue->read(); // Decode - bool pop_prev_dependence = insn[INSN_MEM_1]; - bool pop_next_dependence = insn[INSN_MEM_2]; - bool push_prev_dependence = insn[INSN_MEM_3]; - bool push_next_dependence = insn[INSN_MEM_4]; - memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0); - memop_sram_T sram_base = insn.range(INSN_MEM_6_1, INSN_MEM_6_0); - memop_dram_T dram_base = insn.range(INSN_MEM_7_1, INSN_MEM_7_0); - memop_size_T y_size = insn.range(INSN_MEM_8_1, INSN_MEM_8_0); - memop_size_T x_size = insn.range(INSN_MEM_9_1, INSN_MEM_9_0); - memop_stride_T x_stride = insn.range(INSN_MEM_A_1, INSN_MEM_A_0); - memop_pad_T y_pad_0 = insn.range(INSN_MEM_B_1, INSN_MEM_B_0); - memop_pad_T y_pad_1 = insn.range(INSN_MEM_C_1, INSN_MEM_C_0); - memop_pad_T x_pad_0 = insn.range(INSN_MEM_D_1, INSN_MEM_D_0); - memop_pad_T x_pad_1 = insn.range(INSN_MEM_E_1, INSN_MEM_E_0); + bool pop_prev_dependence = insn[VTA_INSN_MEM_1]; + bool pop_next_dependence = insn[VTA_INSN_MEM_2]; + bool push_prev_dependence = insn[VTA_INSN_MEM_3]; + bool push_next_dependence = insn[VTA_INSN_MEM_4]; + memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0); + memop_sram_T 
sram_base = insn.range(VTA_INSN_MEM_6_1, VTA_INSN_MEM_6_0); + memop_dram_T dram_base = insn.range(VTA_INSN_MEM_7_1, VTA_INSN_MEM_7_0); + memop_size_T y_size = insn.range(VTA_INSN_MEM_8_1, VTA_INSN_MEM_8_0); + memop_size_T x_size = insn.range(VTA_INSN_MEM_9_1, VTA_INSN_MEM_9_0); + memop_stride_T x_stride = insn.range(VTA_INSN_MEM_A_1, VTA_INSN_MEM_A_0); + memop_pad_T y_pad_0 = insn.range(VTA_INSN_MEM_B_1, VTA_INSN_MEM_B_0); + memop_pad_T y_pad_1 = insn.range(VTA_INSN_MEM_C_1, VTA_INSN_MEM_C_0); + memop_pad_T x_pad_0 = insn.range(VTA_INSN_MEM_D_1, VTA_INSN_MEM_D_0); + memop_pad_T x_pad_1 = insn.range(VTA_INSN_MEM_E_1, VTA_INSN_MEM_E_0); // Pop dependence token if instructed if (pop_prev_dependence) { - g2s_dep_queue.read(); + g2s_dep_queue->read(); } // Initialize indices @@ -556,18 +525,19 @@ void store ( // Skip padding along y dimension memop_sram_T y_offset = (x_pad_0 + x_size + x_pad_1) * y_pad_0; sram_idx += y_offset; -#pragma HLS RESOURCE variable=y_offset core=Mul_LUT +// Force this computation to be done with LUTs to avoid using too many DSPs +#pragma HLS RESOURCE variable = y_offset core = Mul_LUT // Copy along y dimension - for (int y = 0; y < y_size; y ++) { + for (int y = 0; y < y_size; y++) { #pragma HLS PIPELINE rewind // Skip padding along x dimension sram_idx += x_pad_0; // Perform data transfer memcpy( - (out_vec_T *) &outputs[dram_idx*BATCH], + const_cast(&outputs[dram_idx*VTA_BATCH]), (const out_vec_T*) &out_mem[sram_idx][0], - x_size * INP_ELEM_BYTES); + x_size * VTA_INP_ELEM_BYTES); sram_idx += x_size; dram_idx += x_stride; // Skip padding along x dimension @@ -576,11 +546,11 @@ void store ( // Push dependence token if instructed if (push_prev_dependence) { - s2g_dep_queue.write(1); + s2g_dep_queue->write(1); } } -void vta ( +void vta( uint32_t insn_count, volatile insn_T *insns, volatile uop_T *uops, @@ -588,14 +558,14 @@ void vta ( volatile wgt_vec_T *weights, volatile acc_vec_T *biases, volatile out_vec_T *outputs) { -#pragma HLS INTERFACE 
s_axilite port=insn_count bundle=CONTROL_BUS -#pragma HLS INTERFACE m_axi port=insns offset=slave bundle=ins_port -#pragma HLS INTERFACE m_axi port=uops offset=slave bundle=uop_port -#pragma HLS INTERFACE m_axi port=inputs offset=slave bundle=data_port -#pragma HLS INTERFACE m_axi port=weights offset=slave bundle=data_port -#pragma HLS INTERFACE m_axi port=biases offset=slave bundle=data_port -#pragma HLS INTERFACE m_axi port=outputs offset=slave bundle=data_port -#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS +#pragma HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS +#pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port +#pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port +#pragma HLS INTERFACE m_axi port = inputs offset = slave bundle = data_port +#pragma HLS INTERFACE m_axi port = weights offset = slave bundle = data_port +#pragma HLS INTERFACE m_axi port = biases offset = slave bundle = data_port +#pragma HLS INTERFACE m_axi port = outputs offset = slave bundle = data_port +#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS // Instantiate temporary instruction queues (used for peeking) hls::stream tmp_load_queue; @@ -614,18 +584,12 @@ void vta ( hls::stream g2s_dep_queue; // Instantiate memories - inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH]; - wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT]; - out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]; + inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH]; + wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT]; + out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]; // Push all instructions into the queues - fetch( - insn_count, - insns, - tmp_load_queue, - tmp_gemm_queue, - tmp_store_queue - ); + fetch(insn_count, insns, &tmp_load_queue, &tmp_gemm_queue, &tmp_store_queue); // Global done indicator uint32_t done = 0; @@ -651,21 +615,13 @@ void vta ( tmp_load_popped = true; } // Check dependences and invoke the load stage - bool pop_next_dependence = 
tmp_load[INSN_MEM_2]; + bool pop_next_dependence = tmp_load[VTA_INSN_MEM_2]; if ((pop_next_dependence && !g2l_dep_queue.empty()) || !pop_next_dependence) { // Push the instruction in the load queue load_queue.write(tmp_load); tmp_load_popped = false; - load( - inputs, - weights, - load_queue, - g2l_dep_queue, - l2g_dep_queue, - inp_mem, - wgt_mem - ); + load(inputs, weights, &load_queue, &g2l_dep_queue, &l2g_dep_queue, inp_mem, wgt_mem); } else { // Execution of load stage pending on completion of other stages, so break here... break; @@ -679,8 +635,8 @@ void vta ( tmp_gemm_popped = true; } // Check dependences and invoke the load stage - bool pop_prev_dependence = tmp_gemv[INSN_MEM_1]; - bool pop_next_dependence = tmp_gemv[INSN_MEM_2]; + bool pop_prev_dependence = tmp_gemv[VTA_INSN_MEM_1]; + bool pop_next_dependence = tmp_gemv[VTA_INSN_MEM_2]; if ( (pop_prev_dependence && !l2g_dep_queue.empty() && pop_next_dependence && !s2g_dep_queue.empty()) || @@ -693,19 +649,8 @@ void vta ( // Push the instruction in the load queue gemm_queue.write(tmp_gemv); tmp_gemm_popped = false; - compute( - done, - uops, - biases, - gemm_queue, - l2g_dep_queue, - s2g_dep_queue, - g2l_dep_queue, - g2s_dep_queue, - inp_mem, - wgt_mem, - out_mem - ); + compute(&done, uops, biases, &gemm_queue, &l2g_dep_queue, &s2g_dep_queue, + &g2l_dep_queue, &g2s_dep_queue, inp_mem, wgt_mem, out_mem); } else { // Execution of load stage pending on completion of other stages, // so break here... 
@@ -720,19 +665,13 @@ void vta ( tmp_store_popped = true; } // Check dependences and invoke the load stage - bool pop_prev_dependence = tmp_store[INSN_MEM_1]; + bool pop_prev_dependence = tmp_store[VTA_INSN_MEM_1]; if ((pop_prev_dependence && !g2s_dep_queue.empty()) || !pop_prev_dependence) { // Push the instruction in the load queue store_queue.write(tmp_store); tmp_store_popped = false; - store( - outputs, - store_queue, - g2s_dep_queue, - s2g_dep_queue, - out_mem - ); + store(outputs, &store_queue, &g2s_dep_queue, &s2g_dep_queue, out_mem); } else { // Execution of load stage pending on completion of other stages, so break here... break; @@ -742,7 +681,7 @@ void vta ( if (done) { break; } - exit_counter ++; + exit_counter++; if (exit_counter > 1000) { if (tmp_load_popped) { if (g2l_dep_queue.empty()) { @@ -750,10 +689,10 @@ void vta ( } } if (tmp_gemm_popped) { - if (l2g_dep_queue.empty() && tmp_gemv[INSN_MEM_1]) { + if (l2g_dep_queue.empty() && tmp_gemv[VTA_INSN_MEM_1]) { printf("waiting on l2g\n"); } - if (s2g_dep_queue.empty() && tmp_gemv[INSN_MEM_2]) { + if (s2g_dep_queue.empty() && tmp_gemv[VTA_INSN_MEM_2]) { printf("waiting on s2g\n"); } } @@ -772,17 +711,17 @@ void vta ( int s2g_count = 0; int g2l_count = 0; int g2s_count = 0; - while(l2g_dep_queue.read_nb(tmp_tok)) { - l2g_count ++; + while (l2g_dep_queue.read_nb(tmp_tok)) { + l2g_count++; } - while(s2g_dep_queue.read_nb(tmp_tok)) { - s2g_count ++; + while (s2g_dep_queue.read_nb(tmp_tok)) { + s2g_count++; } - while(g2l_dep_queue.read_nb(tmp_tok)) { - g2l_count ++; + while (g2l_dep_queue.read_nb(tmp_tok)) { + g2l_count++; } - while(g2s_dep_queue.read_nb(tmp_tok)) { - g2s_count ++; + while (g2s_dep_queue.read_nb(tmp_tok)) { + g2s_count++; } assert(l2g_count == 0 && g2s_count == 0 && g2l_count == 0 && g2s_count == 0); diff --git a/vta/hardware/vivado/src/vta.h b/vta/hardware/vivado/src/vta.h index 5dd4d953e436..37395722f5f7 100644 --- a/vta/hardware/vivado/src/vta.h +++ b/vta/hardware/vivado/src/vta.h @@ 
-3,96 +3,96 @@ * \file vta.h * \brief Type definitions and prototype for VTA HLS design. */ -#ifndef VTA_MAIN_H_ -#define VTA_MAIN_H_ +#ifndef VTA_VTA_H_ +#define VTA_VTA_H_ -#include #include #include +#include #include #include /* \typedef uop_T Micro-op datatype*/ -typedef ap_uint uop_T; +typedef ap_uint uop_T; /* \typedef inp_T Input datatype*/ -typedef ap_int inp_T; +typedef ap_int inp_T; /* \typedef wgt_T Weight datatype*/ -typedef ap_int wgt_T; +typedef ap_int wgt_T; /* \typedef out_T Output datatype*/ -typedef ap_int out_T; +typedef ap_int out_T; /* \typedef acc_T Accumulator datatype*/ -typedef ap_int acc_T; +typedef ap_int acc_T; /* \typedef mul_T Multiplier output datatype*/ -typedef ap_int mul_T; +typedef ap_int mul_T; /* \typedef sum_T GEMM accumulator datatype*/ -typedef ap_int sum_T; +typedef ap_int sum_T; /* \typedef inp_vec_T Input vector datatype*/ -typedef ap_uint inp_vec_T; +typedef ap_uint inp_vec_T; /* \typedef wgt_vec_T Weight vector datatype*/ -typedef ap_uint wgt_vec_T; +typedef ap_uint wgt_vec_T; /* \typedef acc_vec_T Accumulator vector datatype*/ -typedef ap_uint acc_vec_T; +typedef ap_uint acc_vec_T; /* \typedef out_vec_T Output vector datatype*/ -typedef ap_uint out_vec_T; +typedef ap_uint out_vec_T; /* \typedef uop_idx_T Micro-op SRAM index datatype*/ -typedef ap_uint uop_idx_T; +typedef ap_uint uop_idx_T; /* \typedef inp_idx_T Input SRAM index datatype*/ -typedef ap_uint inp_idx_T; +typedef ap_uint inp_idx_T; /* \typedef wgt_idx_T Weight SRAM index datatype*/ -typedef ap_uint wgt_idx_T; +typedef ap_uint wgt_idx_T; /* \typedef acc_idx_T Accumulator SRAM index datatype*/ -typedef ap_uint acc_idx_T; +typedef ap_uint acc_idx_T; /* \typedef opcode_T Opcode datatype*/ -typedef ap_uint opcode_T; +typedef ap_uint opcode_T; /* \typedef insn_T Instruction datatype*/ -typedef ap_uint insn_T; +typedef ap_uint insn_T; /* \typedef loop_T Loop bound datatype*/ -typedef ap_uint loop_T; +typedef ap_uint loop_T; /* \typedef memop_id_T Memory operation 
ID datatype*/ -typedef ap_uint memop_id_T; +typedef ap_uint memop_id_T; /* \typedef memop_sram_T Memory operation SRAM index datatype*/ -typedef ap_uint memop_sram_T; +typedef ap_uint memop_sram_T; /* \typedef memop_dram_T Memory operation DRAM index datatype*/ -typedef ap_uint memop_dram_T; +typedef ap_uint memop_dram_T; /* \typedef memop_size_T Memory operation range datatype*/ -typedef ap_uint memop_size_T; +typedef ap_uint memop_size_T; /* \typedef memop_stride_T Memory operation stride datatype*/ -typedef ap_uint memop_stride_T; +typedef ap_uint memop_stride_T; /* \typedef memop_pad_T Memory operation pad width datatype*/ -typedef ap_uint memop_pad_T; +typedef ap_uint memop_pad_T; /* \typedef aluop_opcode_T ALU operation opcode datatype*/ -typedef ap_uint aluop_opcode_T; +typedef ap_uint aluop_opcode_T; /* \typedef aluop_opcode_T ALU operation immediate datatype*/ -typedef ap_int aluop_imm_T; +typedef ap_int aluop_imm_T; /* \typedef aluop_opcode_T ALU operation shift immediate datatype*/ -typedef ap_uint aluop_sh_imm_T; +typedef ap_uint aluop_sh_imm_T; /*! * \brief Fetch module. @@ -104,12 +104,12 @@ typedef ap_uint aluop_sh_imm_T; * \param gemm_queue GEMM instruction queue. AXI-stream FIFO. * \param store_queue Store instruction queue. AXI-stream FIFO. */ -void fetch ( +void fetch( uint32_t insn_count, volatile insn_T *insns, - hls::stream &load_queue, - hls::stream &gemm_queue, - hls::stream &store_queue); + hls::stream *load_queue, + hls::stream *gemm_queue, + hls::stream *store_queue); /*! * \brief Load module. @@ -126,15 +126,14 @@ void fetch ( * \param inp_mem Local input SRAM buffer. Write only single port BRAM. * \param wgt_mem Local weight SRAM buffer. Write only single port BRAM. 
*/ -void load ( +void load( volatile inp_vec_T *inputs, volatile wgt_vec_T *weights, - hls::stream &load_queue, - hls::stream &g2l_dep_queue, - hls::stream &l2g_dep_queue, - inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH], - wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT] - ); + hls::stream *load_queue, + hls::stream *g2l_dep_queue, + hls::stream *l2g_dep_queue, + inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH], + wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT]); /*! * \brief Compute module. @@ -159,19 +158,18 @@ void load ( * \param wgt_mem Local weight SRAM buffer. Read only single port BRAM. * \param out_mem Local output SRAM buffer. Write only single port BRAM. */ -void compute ( - volatile uint32_t &done, +void compute( + volatile uint32_t *done, volatile uop_T *uops, volatile acc_vec_T *biases, - hls::stream &gemm_queue, - hls::stream &l2g_dep_queue, - hls::stream &s2g_dep_queue, - hls::stream &g2l_dep_queue, - hls::stream &g2s_dep_queue, - out_vec_T inp_mem[INP_BUFF_DEPTH][BATCH], - wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT], - out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH] - ); + hls::stream *gemm_queue, + hls::stream *l2g_dep_queue, + hls::stream *s2g_dep_queue, + hls::stream *g2l_dep_queue, + hls::stream *g2s_dep_queue, + out_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH], + wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT], + out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]); /*! * \brief Store module. @@ -186,13 +184,12 @@ void compute ( * AXI-stream FIFO. * \param out_mem Local output SRAM buffer. Read only single port BRAM. */ -void store ( +void store( volatile out_vec_T *outputs, - hls::stream &store_queue, - hls::stream &g2s_dep_queue, - hls::stream &s2g_dep_queue, - out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH] - ); + hls::stream *store_queue, + hls::stream *g2s_dep_queue, + hls::stream *s2g_dep_queue, + out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]); /*! * \brief VTA wrapper for simulation purpose only. 
@@ -205,7 +202,7 @@ void store ( * \param biases Bias data base address in DRAM. AXI-4 master port. * \param outputs Output data base address in DRAM. AXI-4 master port. */ -void vta ( +void vta( uint32_t insn_count, volatile insn_T *insns, volatile uop_T *uops, @@ -214,4 +211,4 @@ void vta ( volatile acc_vec_T *biases, volatile out_vec_T *outputs); -#endif // VTA_MAIN_H_ \ No newline at end of file +#endif // VTA_VTA_H_ diff --git a/vta/include/vta/driver.h b/vta/include/vta/driver.h index 2b5e0ea93674..c93021d96e4b 100644 --- a/vta/include/vta/driver.h +++ b/vta/include/vta/driver.h @@ -14,10 +14,10 @@ extern "C" { #include #include -/*! \brief Memory management constants with libxlnk_cma */ -#define CACHED 1 -/*! \brief Memory management constants with libxlnk_cma */ -#define NOT_CACHED 0 +/*! \brief Memory management constants */ +#define VTA_CACHED 1 +/*! \brief Memory management constants */ +#define VTA_NOT_CACHED 0 /*! \brief VTA command handle */ typedef void * VTAHandle; @@ -97,4 +97,4 @@ void VTAProgram(const char* bitstream); #ifdef __cplusplus } #endif -#endif // VTA_DRIVER_H_ +#endif // VTA_DRIVER_H_ diff --git a/vta/include/vta/hw_spec.h b/vta/include/vta/hw_spec.h index b18e94e63a07..0c30b344795a 100644 --- a/vta/include/vta/hw_spec.h +++ b/vta/include/vta/hw_spec.h @@ -14,150 +14,153 @@ extern "C" { #include /*! log2 of instruction data type width */ -#define LOG_INS_WIDTH 7 +#define VTA_LOG_INS_WIDTH 7 /*! 
Instruction data type width */ -#define INS_WIDTH (1< 8 bits) -LOG_INP_WIDTH = 3 +VTA_LOG_INP_WIDTH = 3 # Log of kernel weight width in bits (default 3 -> 8 bits) -LOG_WGT_WIDTH = 3 +VTA_LOG_WGT_WIDTH = 3 # Log of accum width in bits (default 5 -> 32 bits) -LOG_ACC_WIDTH = 5 +VTA_LOG_ACC_WIDTH = 5 # Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication) -LOG_BATCH = 0 +VTA_LOG_BATCH = 0 # Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication) -LOG_BLOCK_IN = 4 +VTA_LOG_BLOCK_IN = 4 # Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication) -LOG_BLOCK_OUT = 4 +VTA_LOG_BLOCK_OUT = 4 # Log of uop buffer size in Bytes -LOG_UOP_BUFF_SIZE = 15 +VTA_LOG_UOP_BUFF_SIZE = 15 # Log of inp buffer size in Bytes -LOG_INP_BUFF_SIZE = 15 +VTA_LOG_INP_BUFF_SIZE = 15 # Log of wgt buffer size in Bytes -LOG_WGT_BUFF_SIZE = 15 +VTA_LOG_WGT_BUFF_SIZE = 15 # Log of acc buffer size in Bytes -LOG_ACC_BUFF_SIZE = 17 +VTA_LOG_ACC_BUFF_SIZE = 17 #--------------------- # Derived VTA hardware parameters #-------------------- # Input width in bits -INP_WIDTH = $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" ) +VTA_INP_WIDTH = $(shell echo "$$(( 1 << $(VTA_LOG_INP_WIDTH) ))" ) # Weight width in bits -WGT_WIDTH = $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" ) +VTA_WGT_WIDTH = $(shell echo "$$(( 1 << $(VTA_LOG_WGT_WIDTH) ))" ) # Log of output width in bits -LOG_OUT_WIDTH = $(LOG_INP_WIDTH) +VTA_LOG_OUT_WIDTH = $(VTA_LOG_INP_WIDTH) # Output width in bits -OUT_WIDTH = $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" ) +VTA_OUT_WIDTH = $(shell echo "$$(( 1 << $(VTA_LOG_OUT_WIDTH) ))" ) # Tensor batch size -BATCH = $(shell echo "$$(( 1 << $(LOG_BATCH) ))" ) +VTA_BATCH = $(shell echo "$$(( 1 << $(VTA_LOG_BATCH) ))" ) # Tensor outer block size -IN_BLOCK = $(shell echo "$$(( 1 << $(LOG_BLOCK_IN) ))" ) +VTA_IN_BLOCK = $(shell echo "$$(( 1 << $(VTA_LOG_BLOCK_IN) ))" ) # Tensor inner block size -OUT_BLOCK = $(shell echo "$$(( 1 << $(LOG_BLOCK_OUT) ))" ) +VTA_OUT_BLOCK = 
$(shell echo "$$(( 1 << $(VTA_LOG_BLOCK_OUT) ))" ) # Uop buffer size in Bytes -UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" ) +VTA_UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_UOP_BUFF_SIZE) ))" ) # Inp buffer size in Bytes -INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" ) +VTA_INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_INP_BUFF_SIZE) ))" ) # Wgt buffer size in Bytes -WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" ) +VTA_WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_WGT_BUFF_SIZE) ))" ) # Acc buffer size in Bytes -ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" ) +VTA_ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_ACC_BUFF_SIZE) ))" ) # Log of out buffer size in Bytes -LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" ) +VTA_LOG_OUT_BUFF_SIZE = \ +$(shell echo "$$(( $(VTA_LOG_ACC_BUFF_SIZE) + $(VTA_LOG_OUT_WIDTH) - $(VTA_LOG_ACC_WIDTH) ))" ) # Out buffer size in Bytes -OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" ) +VTA_OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" ) # Update ADD_CFLAGS ADD_CFLAGS += \ -D$(TARGET) \ - -DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \ - -DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \ - -DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_BLOCK_IN) -DLOG_BLOCK_OUT=$(LOG_BLOCK_OUT) \ - -DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \ - -DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \ - -DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE) \ No newline at end of file + -DVTA_LOG_WGT_WIDTH=$(VTA_LOG_WGT_WIDTH) -DVTA_LOG_INP_WIDTH=$(VTA_LOG_INP_WIDTH) \ + -DVTA_LOG_ACC_WIDTH=$(VTA_LOG_ACC_WIDTH) -DVTA_LOG_OUT_WIDTH=$(VTA_LOG_OUT_WIDTH) \ + -DVTA_LOG_BATCH=$(VTA_LOG_BATCH) \ + -DVTA_LOG_BLOCK_IN=$(VTA_LOG_BLOCK_IN) -DVTA_LOG_BLOCK_OUT=$(VTA_LOG_BLOCK_OUT) \ + 
-DVTA_LOG_UOP_BUFF_SIZE=$(VTA_LOG_UOP_BUFF_SIZE) -DVTA_LOG_INP_BUFF_SIZE=$(VTA_LOG_INP_BUFF_SIZE) \ + -DVTA_LOG_WGT_BUFF_SIZE=$(VTA_LOG_WGT_BUFF_SIZE) -DVTA_LOG_ACC_BUFF_SIZE=$(VTA_LOG_ACC_BUFF_SIZE) \ + -DVTA_LOG_OUT_BUFF_SIZE=$(VTA_LOG_OUT_BUFF_SIZE) diff --git a/vta/src/pynq/pynq_driver.cc b/vta/src/pynq/pynq_driver.cc index b4f78db0c160..1787af8da526 100644 --- a/vta/src/pynq/pynq_driver.cc +++ b/vta/src/pynq/pynq_driver.cc @@ -29,65 +29,61 @@ void VTAInvalidateCache(void* buf, int size) { } void *VTAMapRegister(uint32_t addr, size_t length) { - // Align the base address with the pages uint32_t virt_base = addr & ~(getpagesize() - 1); // Calculate base address offset w.r.t the base address uint32_t virt_offset = addr - virt_base; // Open file and mmap - uint32_t mmap_file = open(DEV_MEM_PATH, O_RDWR|O_SYNC); - - return mmap(NULL, (length+virt_offset), PROT_READ|PROT_WRITE, MAP_SHARED, mmap_file, virt_base); + uint32_t mmap_file = open(VTA_PYNQ_DEV_MEM_PATH, O_RDWR|O_SYNC); + return mmap(NULL, + (length+virt_offset), + PROT_READ|PROT_WRITE, + MAP_SHARED, + mmap_file, + virt_base); } void VTAUnmapRegister(void *vta, size_t length) { // Unmap memory int status = munmap(vta, length); - assert(status==0); + assert(status == 0); } void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) { - *((volatile uint32_t *) (((char *) base_addr) + offset)) = val; + *((volatile uint32_t *) (reinterpret_cast(base_addr) + offset)) = val; } uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) { - return *((volatile uint32_t *) (((char *) base_addr) + offset)); + return *((volatile uint32_t *) (reinterpret_cast(base_addr) + offset)); } void VTAProgram(const char* bitstream) { - int elem; FILE *src, *dst, *partial; - - partial = fopen(BS_IS_PARTIAL, "w"); + partial = fopen(VTA_PYNQ_BS_IS_PARTIAL, "w"); if (partial == NULL) { - printf("Cannot open partial config file %s\n", BS_IS_PARTIAL); + printf("Cannot open partial config file %s\n", VTA_PYNQ_BS_IS_PARTIAL); 
fclose(partial); exit(1); } fputc('0', partial); fclose(partial); - src = fopen(bitstream, "rb"); if (src == NULL) { printf("Cannot open bitstream %s\n", bitstream); exit(1); } - - dst = fopen(BS_XDEVCFG, "wb"); + dst = fopen(VTA_PYNQ_BS_XDEVCFG, "wb"); if (dst == NULL) { - printf("Cannot open device file %s\n", BS_XDEVCFG); + printf("Cannot open device file %s\n", VTA_PYNQ_BS_XDEVCFG); fclose(dst); exit(1); } - elem = fgetc(src); while (elem != EOF) { fputc(elem, dst); elem = fgetc(src); } - fclose(src); fclose(dst); - -} \ No newline at end of file +} diff --git a/vta/src/pynq/pynq_driver.h b/vta/src/pynq/pynq_driver.h index 9e9482822a21..952c4cff8c59 100644 --- a/vta/src/pynq/pynq_driver.h +++ b/vta/src/pynq/pynq_driver.h @@ -4,8 +4,8 @@ * \brief VTA driver for Pynq board. */ -#ifndef VTA_PYNQ_DRIVER_H_ -#define VTA_PYNQ_DRIVER_H_ +#ifndef VTA_PYNQ_PYNQ_DRIVER_H_ +#define VTA_PYNQ_PYNQ_DRIVER_H_ #ifdef __cplusplus extern "C" { @@ -32,17 +32,20 @@ void xlnkFlushCache(void* buf, int size); void xlnkInvalidateCache(void* buf, int size); #endif -/*! \brief partial bitstream status file path */ -#define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream" -/*! \brief bitstream destination file path */ -#define BS_XDEVCFG "/dev/xdevcfg" +/*! \brief (Pynq only) Partial bitstream status file path */ +#define VTA_PYNQ_BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream" +/*! \brief (Pynq only) Bitstream destination file path */ +#define VTA_PYNQ_BS_XDEVCFG "/dev/xdevcfg" -/*! \brief Path to /dev/mem */ -#define DEV_MEM_PATH "/dev/mem" -/*! \brief MMIO driver constant */ -#define MMIO_WORD_LENGTH 4 -/*! \brief MMIO driver constant */ -#define MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1)) +/*! \brief (Pynq only) Path to /dev/mem */ +#define VTA_PYNQ_DEV_MEM_PATH "/dev/mem" +/*! \brief (Pynq only) MMIO driver constant */ +#define VTA_PYNQ_MMIO_WORD_LENGTH 4 +/*! 
\brief (Pynq only) MMIO driver constant */ +#define VTA_PYNQ_MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1)) + +/*! \brief Physically contiguous buffer size limit */ +#define VTA_MAX_XFER (1<<22) /*! \brief VTA configuration register address range */ #define VTA_RANGE 0x100 @@ -74,10 +77,7 @@ void xlnkInvalidateCache(void* buf, int size); */ #define VTA_STORE_ADDR 0x43C30000 -/*! \brief Buffer size limit */ -#define MAX_XFER (1<<22) - #ifdef __cplusplus } #endif -#endif // VTA_PYNQ_DRIVER_H_ \ No newline at end of file +#endif // VTA_PYNQ_PYNQ_DRIVER_H_ \ No newline at end of file diff --git a/vta/src/runtime.cc b/vta/src/runtime.cc index 570816e5236a..dde88e8cc829 100644 --- a/vta/src/runtime.cc +++ b/vta/src/runtime.cc @@ -4,19 +4,20 @@ * \brief VTA runtime for PYNQ in C++11 */ +#ifdef VTA_PYNQ_TARGET +#include "./pynq/pynq_driver.h" +#endif // VTA_PYNQ_TARGET + +#include +#include +#include + #include #include #include #include #include #include -#include -#include -#include - -#ifdef PYNQ_TARGET -#include "./pynq/pynq_driver.h" -#endif //PYNQ_TARGET namespace vta { @@ -193,21 +194,21 @@ class UopKernel { op.wgt_idx = wgt_index; seq_.push_back(op); // Ensure that mode is consistent if set - if (mode_==0xFFFFFFFF) { + if (mode_ == 0xFFFFFFFF) { mode_ = mode; } else { - assert(mode_==mode); + assert(mode_ == mode); } // Check kernel op and imm/imm_val in ALU mode - if (mode==1) { - if (opcode_==0xFFFFFFFF) { - opcode_=opcode; - use_imm_=use_imm; - imm_val_=imm_val; + if (mode == 1) { + if (opcode_ == 0xFFFFFFFF) { + opcode_ = opcode; + use_imm_ = use_imm; + imm_val_ = imm_val; } else { - assert(opcode_==opcode); - assert(use_imm_==use_imm); - assert(imm_val_==imm_val); + assert(opcode_ == opcode); + assert(use_imm_ == use_imm); + assert(imm_val_ == imm_val); } } } @@ -222,17 +223,17 @@ class UopKernel { seq_[i].src_idx, seq_[i].wgt_idx, seq_[i].reset_out); - } printf("\n"); } public: // The kernel's mode, opcode, immediate setting and value - uint32_t mode_{0xFFFFFFFF}; 
// UOP type: 0xFFFFFFFF - unset, 0 - GEMM, 1 - ALU + uint32_t mode_{0xFFFFFFFF}; // UOP type: 0xFFFFFFFF - unset, 0 - GEMM, 1 - ALU uint32_t opcode_{0xFFFFFFFF}; bool use_imm_{false}; uint16_t imm_val_{0}; + private: // Verify that we don't write to the same acc_mem index two cycles in a row void VerifyDep(uint32_t dst_index) { @@ -375,7 +376,7 @@ class UopQueue : public BaseQueue { } // Simple eviction policy uint32_t evict_begin = cache_ptr_; - for (;cache_ptr_ < cache_.size(); ++cache_ptr_) { + for (; cache_ptr_ < cache_.size(); ++cache_ptr_) { if (cache_[cache_ptr_]->sram_begin_ >= sram_end_) break; cache_[cache_ptr_]->sram_begin_ = 0; cache_[cache_ptr_]->sram_end_ = 0; @@ -395,7 +396,7 @@ class UopQueue : public BaseQueue { void FlushUopLoad(VTAMemInsn* insn) { if (sram_begin_ != sram_end_) { assert((dram_end_ - dram_begin_) == (sram_end_ - sram_begin_)); - insn->memory_type = MEM_ID_UOP; + insn->memory_type = VTA_MEM_ID_UOP; insn->sram_base = sram_begin_; insn->dram_base = dram_phy_addr_ / kElemBytes + dram_begin_; insn->y_size = 1; @@ -418,7 +419,7 @@ class UopQueue : public BaseQueue { std::vector cache_; // Constants static constexpr int kElemBytes = sizeof(VTAUop); - static constexpr int kMaxNumUop = UOP_BUFF_DEPTH; + static constexpr int kMaxNumUop = VTA_UOP_BUFF_DEPTH; static constexpr int kMaxElems = kMaxBytes / kElemBytes; }; @@ -541,22 +542,22 @@ class InsnQueue : public BaseQueue { for (int i = 1; i < insn_count; ++i) { PipelineStage prev = GetPipelineStage(mem_ptr + i - 1); PipelineStage now = GetPipelineStage(mem_ptr + i); - if (prev==kLoadStage && now==kComputeStage) { + if (prev == kLoadStage && now == kComputeStage) { mem_ptr[i - 1].push_prev_dep = false; mem_ptr[i - 1].push_next_dep = true; mem_ptr[i].pop_prev_dep = true; mem_ptr[i].pop_next_dep = false; - } else if (prev==kComputeStage && now==kLoadStage) { + } else if (prev == kComputeStage && now == kLoadStage) { mem_ptr[i - 1].push_prev_dep = true; mem_ptr[i - 1].push_next_dep = false; 
mem_ptr[i].pop_prev_dep = false; mem_ptr[i].pop_next_dep = true; - } else if (prev==kStoreStage && now==kComputeStage) { + } else if (prev == kStoreStage && now == kComputeStage) { mem_ptr[i - 1].push_prev_dep = true; mem_ptr[i - 1].push_next_dep = false; mem_ptr[i].pop_prev_dep = false; mem_ptr[i].pop_next_dep = true; - } else if (prev==kComputeStage && now==kStoreStage) { + } else if (prev == kComputeStage && now == kStoreStage) { mem_ptr[i - 1].push_prev_dep = false; mem_ptr[i - 1].push_next_dep = true; mem_ptr[i].pop_prev_dep = true; @@ -573,39 +574,39 @@ class InsnQueue : public BaseQueue { // Helper function: Get Opcode string const char* getOpcodeString(int opcode, bool use_imm) { // The string name - if (opcode==ALU_OPCODE_MIN) { + if (opcode == VTA_ALU_OPCODE_MIN) { if (use_imm) { return "min imm"; } else { return "min"; } - } else if (opcode==ALU_OPCODE_MAX) { + } else if (opcode == VTA_ALU_OPCODE_MAX) { if (use_imm) { return "max imm"; } else { return "max"; } - } else if (opcode==ALU_OPCODE_ADD) { + } else if (opcode == VTA_ALU_OPCODE_ADD) { if (use_imm) { return "add imm"; } else { return "add"; } - } else if (opcode==ALU_OPCODE_SUB) { + } else if (opcode == VTA_ALU_OPCODE_SUB) { if (use_imm) { return "sub imm"; } else { return "sub"; } - } else if (opcode==ALU_OPCODE_MUL) { + } else if (opcode == VTA_ALU_OPCODE_MUL) { if (use_imm) { return "mul imm"; } else { return "mul"; } - } else if (opcode==ALU_OPCODE_SHL) { + } else if (opcode == VTA_ALU_OPCODE_SHL) { return "shl"; - } else if (opcode==ALU_OPCODE_SHR) { + } else if (opcode == VTA_ALU_OPCODE_SHR) { return "shr"; } @@ -629,12 +630,11 @@ class InsnQueue : public BaseQueue { // Fetch instruction and decode opcode c.generic = insn[i]; printf("INSTRUCTION %u: ", i); - if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) { + if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) { if (c.mem.x_size == 0) { - if (c.mem.opcode == OPCODE_STORE) { + if (c.mem.opcode == 
VTA_OPCODE_STORE) { printf("NOP-STORE-STAGE\n"); - } - else if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) { + } else if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) { printf("NOP-COMPUTE-STAGE\n"); } else { printf("NOP-MEMORY-STAGE\n"); @@ -645,15 +645,15 @@ class InsnQueue : public BaseQueue { static_cast(c.mem.push_prev_dep), static_cast(c.mem.push_next_dep)); // Count status in queues - if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) { - if (c.mem.opcode == OPCODE_STORE) { + if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) { + if (c.mem.opcode == VTA_OPCODE_STORE) { assert(c.mem.pop_next_dep == false); assert(c.mem.push_next_dep == false); if (c.mem.pop_prev_dep) g2s_queue--; if (c.mem.push_prev_dep) s2g_queue++; - } else if (c.mem.opcode == OPCODE_LOAD && - (c.mem.memory_type == MEM_ID_INP || - c.mem.memory_type == MEM_ID_WGT) ) { + } else if (c.mem.opcode == VTA_OPCODE_LOAD && + (c.mem.memory_type == VTA_MEM_ID_INP || + c.mem.memory_type == VTA_MEM_ID_WGT) ) { assert(c.mem.pop_prev_dep == false); assert(c.mem.push_prev_dep == false); if (c.mem.pop_next_dep) g2l_queue--; @@ -664,7 +664,7 @@ class InsnQueue : public BaseQueue { if (c.mem.pop_next_dep) s2g_queue--; if (c.mem.push_next_dep) g2s_queue++; } - } else if (c.mem.opcode == OPCODE_GEMM) { + } else if (c.mem.opcode == VTA_OPCODE_GEMM) { // Print instruction field information if (c.gemm.pop_prev_dep) l2g_queue--; if (c.gemm.push_prev_dep) g2l_queue++; @@ -676,14 +676,14 @@ class InsnQueue : public BaseQueue { continue; } // Print instruction field information - if (c.mem.opcode==OPCODE_LOAD) { + if (c.mem.opcode == VTA_OPCODE_LOAD) { printf("LOAD "); - if (c.mem.memory_type == MEM_ID_UOP) printf("UOP\n"); - if (c.mem.memory_type == MEM_ID_WGT) printf("WGT\n"); - if (c.mem.memory_type == MEM_ID_INP) printf("INP\n"); - if (c.mem.memory_type == MEM_ID_ACC) printf("ACC\n"); + if (c.mem.memory_type == VTA_MEM_ID_UOP) printf("UOP\n"); + if 
(c.mem.memory_type == VTA_MEM_ID_WGT) printf("WGT\n"); + if (c.mem.memory_type == VTA_MEM_ID_INP) printf("INP\n"); + if (c.mem.memory_type == VTA_MEM_ID_ACC) printf("ACC\n"); } - if (c.mem.opcode==OPCODE_STORE) { + if (c.mem.opcode == VTA_OPCODE_STORE) { printf("STORE\n"); } printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", @@ -703,7 +703,7 @@ class InsnQueue : public BaseQueue { static_cast(c.mem.x_stride), static_cast(c.mem.x_pad_0), static_cast(c.mem.x_pad_1)); - } else if (c.mem.opcode==OPCODE_GEMM) { + } else if (c.mem.opcode == VTA_OPCODE_GEMM) { // Print instruction field information printf("GEMM\n"); @@ -725,7 +725,7 @@ class InsnQueue : public BaseQueue { static_cast(c.gemm.wgt_factor_in), static_cast(c.gemm.src_factor_in), static_cast(c.gemm.dst_factor_in)); - } else if (c.mem.opcode == OPCODE_ALU) { + } else if (c.mem.opcode == VTA_OPCODE_ALU) { // Print instruction field information printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm)); printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", @@ -744,20 +744,20 @@ class InsnQueue : public BaseQueue { static_cast(c.alu.iter_in), static_cast(c.alu.dst_factor_in), static_cast(c.alu.src_factor_in)); - } else if (c.mem.opcode == OPCODE_FINISH) { + } else if (c.mem.opcode == VTA_OPCODE_FINISH) { printf("FINISH\n"); } // Count status in queues - if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) { - if (c.mem.opcode == OPCODE_STORE) { + if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) { + if (c.mem.opcode == VTA_OPCODE_STORE) { assert(c.mem.pop_next_dep == false); assert(c.mem.push_next_dep == false); if (c.mem.pop_prev_dep) g2s_queue--; if (c.mem.push_prev_dep) s2g_queue++; - } else if (c.mem.opcode == OPCODE_LOAD && - (c.mem.memory_type == MEM_ID_INP || - c.mem.memory_type == MEM_ID_WGT) ) { + } else if (c.mem.opcode == VTA_OPCODE_LOAD && + (c.mem.memory_type == VTA_MEM_ID_INP || + c.mem.memory_type == 
VTA_MEM_ID_WGT) ) { assert(c.mem.pop_prev_dep == false); assert(c.mem.push_prev_dep == false); if (c.mem.pop_next_dep) g2l_queue--; @@ -768,8 +768,8 @@ class InsnQueue : public BaseQueue { if (c.mem.pop_next_dep) s2g_queue--; if (c.mem.push_next_dep) g2s_queue++; } - } else if (c.mem.opcode == OPCODE_GEMM || - c.mem.opcode == OPCODE_ALU) { + } else if (c.mem.opcode == VTA_OPCODE_GEMM || + c.mem.opcode == VTA_OPCODE_ALU) { // Print instruction field information if (c.gemm.pop_prev_dep) l2g_queue--; if (c.gemm.push_prev_dep) g2l_queue++; @@ -832,23 +832,24 @@ class InsnQueue : public BaseQueue { } // Get stage of the memory static PipelineStage GetMemPipelineStage(int memory_type) { - if (memory_type == MEM_ID_ACC) return kComputeStage; - if (memory_type == MEM_ID_UOP) return kComputeStage; + if (memory_type == VTA_MEM_ID_ACC) return kComputeStage; + if (memory_type == VTA_MEM_ID_UOP) return kComputeStage; return kLoadStage; } // Get stage of the computation static PipelineStage GetPipelineStage(VTAMemInsn* insn) { - if (insn->opcode == OPCODE_GEMM) return kComputeStage; - if (insn->opcode == OPCODE_ALU) return kComputeStage; - if (insn->opcode == OPCODE_LOAD) { + if (insn->opcode == VTA_OPCODE_GEMM) return kComputeStage; + if (insn->opcode == VTA_OPCODE_ALU) return kComputeStage; + if (insn->opcode == VTA_OPCODE_LOAD) { if (insn->x_size == 0) return kNoneStage; - if (insn->memory_type == MEM_ID_ACC) return kComputeStage; - if (insn->memory_type == MEM_ID_UOP) return kComputeStage; + if (insn->memory_type == VTA_MEM_ID_ACC) return kComputeStage; + if (insn->memory_type == VTA_MEM_ID_UOP) return kComputeStage; return kLoadStage; } - if (insn->opcode == OPCODE_STORE) { - // FIXME: Right now memory_type is a 2-bit field which means that MEM_ID_OUT will appear as 0 - // For now we'll refrain from checking the memory_type to avoid an assertion error... 
+ if (insn->opcode == VTA_OPCODE_STORE) { + // FIXME: Right now memory_type is a 2-bit field which means that + // VTA_MEM_ID_OUT will appear as 0. For now we'll refrain from + // checking the memory_type to avoid an assertion error... return kStoreStage; } assert(false); @@ -859,7 +860,7 @@ class InsnQueue : public BaseQueue { bool push_prev_dep, bool push_next_dep, bool pop_prev_dep, bool pop_next_dep) { VTAMemInsn* insn = reinterpret_cast(NextInsn()); - insn->opcode = (stage==kStoreStage ? OPCODE_STORE : OPCODE_LOAD); + insn->opcode = (stage == kStoreStage ? VTA_OPCODE_STORE : VTA_OPCODE_LOAD); insn->push_prev_dep = push_prev_dep; insn->push_next_dep = push_next_dep; insn->pop_prev_dep = pop_prev_dep; @@ -873,7 +874,7 @@ class InsnQueue : public BaseQueue { insn->y_pad_1 = 0; insn->x_pad_0 = 0; insn->x_pad_1 = 0; - insn->memory_type = (stage == kLoadStage ? MEM_ID_INP : MEM_ID_UOP); + insn->memory_type = (stage == kLoadStage ? VTA_MEM_ID_INP : VTA_MEM_ID_UOP); } private: @@ -913,12 +914,12 @@ class CommandQueue { } uint32_t GetElemBytes(uint32_t memory_id) { - switch (memory_id){ - case MEM_ID_UOP: return UOP_ELEM_BYTES; - case MEM_ID_INP: return INP_ELEM_BYTES; - case MEM_ID_WGT: return WGT_ELEM_BYTES; - case MEM_ID_ACC: return ACC_ELEM_BYTES; - case MEM_ID_OUT: return INP_ELEM_BYTES; + switch (memory_id) { + case VTA_MEM_ID_UOP: return VTA_UOP_ELEM_BYTES; + case VTA_MEM_ID_INP: return VTA_INP_ELEM_BYTES; + case VTA_MEM_ID_WGT: return VTA_WGT_ELEM_BYTES; + case VTA_MEM_ID_ACC: return VTA_ACC_ELEM_BYTES; + case VTA_MEM_ID_OUT: return VTA_INP_ELEM_BYTES; default: break; } printf("Memory id not recognized: %d\n", memory_id); @@ -938,7 +939,7 @@ class CommandQueue { uint32_t dst_sram_index, uint32_t dst_memory_type) { VTAMemInsn* insn = insn_queue_.CreateMemInsn(dst_memory_type); - insn->opcode = OPCODE_LOAD; + insn->opcode = VTA_OPCODE_LOAD; insn->memory_type = dst_memory_type; insn->sram_base = dst_sram_index; DataBuffer* src = 
DataBuffer::FromHandle(src_dram_addr); @@ -961,7 +962,7 @@ class CommandQueue { uint32_t y_size, uint32_t x_stride) { VTAMemInsn* insn = insn_queue_.CreateStoreInsn(); - insn->opcode = OPCODE_STORE; + insn->opcode = VTA_OPCODE_STORE; insn->memory_type = src_memory_type; insn->sram_base = src_sram_index; DataBuffer* dst = DataBuffer::FromHandle(dst_dram_addr); @@ -1013,7 +1014,7 @@ class CommandQueue { insn_queue_.CommitPendingPop(kComputeStage); // NOTE: FINISH cannot contain pop VTAGemInsn* insn = insn_queue_.CreateGemInsn(); - insn->opcode = OPCODE_FINISH; + insn->opcode = VTA_OPCODE_FINISH; assert(!insn_queue_.PendingPop()); // Check if there are no instruction to execute at all if (insn_queue_.count() == 0) return; @@ -1026,11 +1027,11 @@ class CommandQueue { } // Make sure that the last instruction is a finish instruction assert(reinterpret_cast( - insn_queue_.data())[insn_queue_.count()-1].opcode == OPCODE_FINISH); + insn_queue_.data())[insn_queue_.count()-1].opcode == VTA_OPCODE_FINISH); -#ifdef PYNQ_TARGET +#ifdef VTA_PYNQ_TARGET // Make sure that we don't exceed contiguous physical memory limits - assert(insn_queue_.count() < MAX_XFER); + assert(insn_queue_.count() < VTA_MAX_XFER); // NOTE: Register address map is derived from the auto-generated // driver files available under hardware/build/vivado//export/driver @@ -1064,7 +1065,7 @@ class CommandQueue { } // Report error if timeout assert(t < wait_cycles); -#endif //PYNQ_TARGET +#endif // VTA_PYNQ_TARGET // Reset buffers uop_queue_.Reset(); @@ -1142,12 +1143,12 @@ class CommandQueue { uop_queue_.Push(kernel, [this]() { this->AutoSync(); }); if (uop_queue_.pending()) { - VTAMemInsn* insn = insn_queue_.CreateMemInsn(MEM_ID_UOP); - insn->opcode = OPCODE_LOAD; + VTAMemInsn* insn = insn_queue_.CreateMemInsn(VTA_MEM_ID_UOP); + insn->opcode = VTA_OPCODE_LOAD; uop_queue_.FlushUopLoad(insn); } VTAGemInsn* insn = insn_queue_.CreateGemInsn(); - insn->opcode = OPCODE_GEMM; + insn->opcode = VTA_OPCODE_GEMM; 
insn->uop_bgn = kernel->sram_begin_; insn->uop_end = kernel->sram_end_; const std::vector &loop = kernel->loop(); @@ -1180,12 +1181,12 @@ class CommandQueue { uop_queue_.Push(kernel, [this]() { this->AutoSync(); }); if (uop_queue_.pending()) { - VTAMemInsn* insn = insn_queue_.CreateMemInsn(MEM_ID_UOP); - insn->opcode = OPCODE_LOAD; + VTAMemInsn* insn = insn_queue_.CreateMemInsn(VTA_MEM_ID_UOP); + insn->opcode = VTA_OPCODE_LOAD; uop_queue_.FlushUopLoad(insn); } VTAAluInsn* insn = insn_queue_.CreateAluInsn(); - insn->opcode = OPCODE_ALU; + insn->opcode = VTA_OPCODE_ALU; insn->uop_bgn = kernel->sram_begin_; insn->uop_end = kernel->sram_end_; insn->alu_opcode = kernel->opcode_; @@ -1219,7 +1220,7 @@ class CommandQueue { void CheckInsnOverFlow() { // At each API call, we can at most commit: // one pending store, one pending load, and one uop - if (insn_queue_.count() >= MAX_XFER) { + if (insn_queue_.count() >= VTA_MAX_XFER) { this->AutoSync(); } } @@ -1237,9 +1238,9 @@ class CommandQueue { // The kernel we currently recording UopKernel* record_kernel_{nullptr}; // Micro op queue - UopQueue uop_queue_; + UopQueue uop_queue_; // instruction queue - InsnQueue insn_queue_; + InsnQueue insn_queue_; }; } // namespace vta @@ -1342,10 +1343,10 @@ void VTAStoreBuffer2D(VTACommandHandle cmd, uint32_t x_size, uint32_t y_size, uint32_t x_stride) { - static_cast(cmd)-> - StoreBuffer2D(src_sram_index, src_memory_type, - dst_dram_addr, dst_elem_offset, - x_size, y_size, x_stride); + static_cast(cmd)-> + StoreBuffer2D(src_sram_index, src_memory_type, + dst_dram_addr, dst_elem_offset, + x_size, y_size, x_stride); } void VTAUopPush(uint32_t mode, diff --git a/vta/src/tvm/vta_device_api.cc b/vta/src/tvm/vta_device_api.cc index b686b65fc415..b7b57e199f3f 100644 --- a/vta/src/tvm/vta_device_api.cc +++ b/vta/src/tvm/vta_device_api.cc @@ -1,8 +1,14 @@ -// simply include the driver for now. +/*! 
+ * Copyright (c) 2018 by Contributors + * \file vta_device_api.cc + * \brief VTA device API for TVM + */ + #include #include #include -#include "../../tvm/src/runtime/workspace_pool.h" + +#include "../../nnvm/tvm/src/runtime/workspace_pool.h" namespace tvm { namespace runtime { diff --git a/vta/tests/hardware/common/test_lib.cc b/vta/tests/hardware/common/test_lib.cc index d203b2aa1307..7f46a43b1867 100644 --- a/vta/tests/hardware/common/test_lib.cc +++ b/vta/tests/hardware/common/test_lib.cc @@ -6,41 +6,43 @@ #include "./test_lib.h" +uint32_t globalSeed; + const char* getOpcodeString(int opcode, bool use_imm) { // Returns string name - if (opcode == ALU_OPCODE_MIN) { + if (opcode == VTA_ALU_OPCODE_MIN) { if (use_imm) { return "min imm"; } else { return "min"; } - } else if (opcode == ALU_OPCODE_MAX) { + } else if (opcode == VTA_ALU_OPCODE_MAX) { if (use_imm) { return "max imm"; } else { return "max"; } - } else if (opcode == ALU_OPCODE_ADD) { + } else if (opcode == VTA_ALU_OPCODE_ADD) { if (use_imm) { return "add imm"; } else { return "add"; } - } else if (opcode == ALU_OPCODE_SUB) { + } else if (opcode == VTA_ALU_OPCODE_SUB) { if (use_imm) { return "sub imm"; } else { return "sub"; } - } else if (opcode == ALU_OPCODE_MUL) { + } else if (opcode == VTA_ALU_OPCODE_MUL) { if (use_imm) { return "mul imm"; } else { return "mul"; } - } else if (opcode == ALU_OPCODE_SHL) { + } else if (opcode == VTA_ALU_OPCODE_SHL) { return "shl"; - } else if (opcode == ALU_OPCODE_SHR) { + } else if (opcode == VTA_ALU_OPCODE_SHR) { return "shr"; } return "unknown op"; @@ -49,20 +51,20 @@ const char* getOpcodeString(int opcode, bool use_imm) { template void packBuffer(T *dst, T **src, int y_size, int x_size, int y_block, int x_block) { int buffer_idx = 0; - for(int i = 0; i < y_size / y_block; i ++) { - for(int j = 0; j < x_size / x_block; j ++) { - for(int k = 0; k < y_block; k ++) { + for (int i = 0; i < y_size / y_block; i++) { + for (int j = 0; j < x_size / x_block; j++) { + for (int 
k = 0; k < y_block; k++) { if (T_WIDTH < 8) { for (int l = 0; l < x_block; l += 8 / T_WIDTH) { dst[buffer_idx] = 0; - for (int m = 0; m < 8 / T_WIDTH; m ++) { + for (int m = 0; m < 8 / T_WIDTH; m++) { dst[buffer_idx] |= (src[i * y_block + k][j * x_block + l + m] & ((1ULL << T_WIDTH) - 1)) << (m * T_WIDTH); } - buffer_idx ++; + buffer_idx++; } } else { - for (int l = 0; l < x_block; l ++) { + for (int l = 0; l < x_block; l++) { dst[buffer_idx++] = src[i * y_block + k][j * x_block + l]; } } @@ -74,20 +76,20 @@ void packBuffer(T *dst, T **src, int y_size, int x_size, int y_block, int x_bloc template void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_block) { int buffer_idx = 0; - for(int i = 0; i < y_size / y_block; i ++) { - for(int j = 0; j < x_size / x_block; j ++) { - for(int k = 0; k < y_block; k ++) { + for (int i = 0; i < y_size / y_block; i++) { + for (int j = 0; j < x_size / x_block; j++) { + for (int k = 0; k < y_block; k++) { if (T_WIDTH < 8) { for (int l = 0; l < x_block; l += 8 / T_WIDTH) { - for (int m = 0; m < 8 / T_WIDTH; m ++) { + for (int m = 0; m < 8 / T_WIDTH; m++) { dst[i * y_block + k][j * x_block + l + m] = (src[buffer_idx] >> (m * T_WIDTH)) & ((1 << T_WIDTH) - 1); } - buffer_idx ++; + buffer_idx++; } } else { - for (int l = 0; l < x_block; l ++) { - dst[i * y_block + k][j * x_block + l] = src[buffer_idx ++]; + for (int l = 0; l < x_block; l++) { + dst[i * y_block + k][j * x_block + l] = src[buffer_idx++]; } } } @@ -98,14 +100,15 @@ void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_bl template T ** allocInit2dArray(int rows, int cols) { // Allocate - T **array = (T **) malloc(sizeof(T *) * rows); - for (int i = 0; i < rows; i ++) { - array[i] = (T *) malloc(sizeof(T) * cols); + T **array = static_cast(malloc(sizeof(T *) * rows)); + for (int i = 0; i < rows; i++) { + array[i] = static_cast(malloc(sizeof(T) * cols)); } // Init - for (int i = 0; i < rows; i ++) { - for (int j = 0; j < cols; j 
++) { - array[i][j] = (T) (rand() % (1LL << (T_WIDTH - 1)) - (1LL << (T_WIDTH - 2))); + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + array[i][j] = + static_cast(rand_r(&globalSeed) % (1LL << (T_WIDTH - 1)) - (1LL << (T_WIDTH - 2))); } } return array; @@ -113,16 +116,16 @@ T ** allocInit2dArray(int rows, int cols) { template T ** alloc2dArray(int rows, int cols) { - T **array = (T **) malloc(sizeof(T *) * rows); - for (int i = 0; i < rows; i ++) { - array[i] = (T *) malloc(sizeof(T) * cols); + T **array = static_cast(malloc(sizeof(T *) * rows)); + for (int i = 0; i < rows; i++) { + array[i] = static_cast(malloc(sizeof(T) * cols)); } return array; } template void free2dArray(T **array, int rows, int cols) { - for (int i = 0; i < rows; i ++) { + for (int i = 0; i < rows; i++) { free(array[i]); } free(array); @@ -130,11 +133,11 @@ void free2dArray(T **array, int rows, int cols) { template T *** alloc3dArray(int rows, int cols, int depth) { - T ***array = (T ***) malloc(sizeof(T **) * rows); - for (int i = 0; i < rows; i ++) { - array[i] = (T **) malloc(sizeof(T *) * cols); - for (int j = 0; j < cols; j ++) { - array[i][j] = (T*) malloc(sizeof(T) * depth); + T ***array = static_cast(malloc(sizeof(T **) * rows)); + for (int i = 0; i < rows; i++) { + array[i] = static_cast(malloc(sizeof(T *) * cols)); + for (int j = 0; j < cols; j++) { + array[i][j] = static_cast(malloc(sizeof(T) * depth)); } } return array; @@ -142,8 +145,8 @@ T *** alloc3dArray(int rows, int cols, int depth) { template void free3dArray(T *** array, int rows, int cols, int depth) { - for (int i = 0; i < rows; i ++) { - for (int j = 0; j < cols; j ++) { + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { free(array[i][j]); } free(array[i]); @@ -153,7 +156,7 @@ void free3dArray(T *** array, int rows, int cols, int depth) { void * allocBuffer(size_t num_bytes) { #ifdef NO_SIM - return VTAMemAlloc(num_bytes, CACHED); + return VTAMemAlloc(num_bytes, VTA_CACHED); 
#else return malloc(num_bytes); #endif @@ -173,7 +176,7 @@ VTAGenericInsn reset2DInsn(int type, int sram_offset, int y_size, int x_size, in union VTAInsn converter; // Memory instruction initialization VTAMemInsn insn = {}; - insn.opcode = OPCODE_LOAD; + insn.opcode = VTA_OPCODE_LOAD; insn.pop_prev_dep = pop_prev_dep; insn.pop_next_dep = pop_next_dep; insn.push_prev_dep = push_prev_dep; @@ -250,7 +253,7 @@ VTAGenericInsn getGEMMInsn(int uop_offset, int batch, int in_feat, int out_feat, union VTAInsn converter; // GEVM instruction initialization VTAGemInsn insn; - insn.opcode = OPCODE_GEMM; + insn.opcode = VTA_OPCODE_GEMM; insn.pop_prev_dep = pop_prev_dep; insn.pop_next_dep = pop_next_dep; insn.push_prev_dep = push_prev_dep; @@ -288,7 +291,7 @@ VTAGenericInsn getALUInsn(int opcode, int vector_size, bool use_imm, int imm, bo union VTAInsn converter; // Memory instruction initialization VTAAluInsn insn = {}; - insn.opcode = OPCODE_ALU; + insn.opcode = VTA_OPCODE_ALU; insn.pop_prev_dep = pop_prev_dep; insn.pop_next_dep = pop_next_dep; insn.push_prev_dep = push_prev_dep; @@ -327,7 +330,7 @@ VTAGenericInsn getFinishInsn(bool pop_prev, bool pop_next) { union VTAInsn converter; // GEVM instruction initialization VTAGemInsn insn; - insn.opcode = OPCODE_FINISH; + insn.opcode = VTA_OPCODE_FINISH; insn.pop_prev_dep = pop_prev; insn.pop_next_dep = pop_next; insn.push_prev_dep = 0; @@ -347,21 +350,20 @@ VTAGenericInsn getFinishInsn(bool pop_prev, bool pop_next) { } VTAUop * getCopyUops(int y_size, int x_size, int uop_compression) { - // Derive the total uop size int uop_size = (uop_compression) ? 
1 : y_size * x_size; // Allocate buffer #ifdef NO_SIM - VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED); + VTAUop *uop_buf = static_cast(VTAMemAlloc(sizeof(VTAUop) * uop_size, VTA_CACHED)); #else - VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size); + VTAUop *uop_buf = static_cast(malloc(sizeof(VTAUop) * uop_size)); #endif if (!uop_compression) { int uop_idx = 0; - for (int i = 0; i < y_size; i ++) { - for (int j = 0; j < x_size; j ++) { + for (int i = 0; i < y_size; i++) { + for (int j = 0; j < x_size; j++) { uop_buf[uop_idx].reset_out = false; uop_buf[uop_idx].dst_idx = i * x_size + j; uop_buf[uop_idx].src_idx = 0; @@ -381,23 +383,22 @@ VTAUop * getCopyUops(int y_size, int x_size, int uop_compression) { VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression, bool multi_threaded) { - // Derive the total uop size int uop_size = (uop_compression) ? batch : batch * in_feat * out_feat; if (multi_threaded) uop_size *= 2; // Allocate buffer #ifdef NO_SIM - VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED); + VTAUop *uop_buf = static_cast(VTAMemAlloc(sizeof(VTAUop) * uop_size, VTA_CACHED)); #else - VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size); + VTAUop *uop_buf = static_cast(malloc(sizeof(VTAUop) * uop_size)); #endif if (!uop_compression) { int uop_idx = 0; - for (int i = 0; i < batch; i ++) { - for (int j = 0; j < in_feat; j ++) { - for (int k = 0; k < out_feat; k ++) { + for (int i = 0; i < batch; i++) { + for (int j = 0; j < in_feat; j++) { + for (int k = 0; k < out_feat; k++) { uop_buf[uop_idx].reset_out = false; uop_buf[uop_idx].dst_idx = i * out_feat + k; uop_buf[uop_idx].src_idx = i * in_feat + j; @@ -407,7 +408,7 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression, } } } else { - for (int i = 0; i < batch; i ++) { + for (int i = 0; i < batch; i++) { uop_buf[i].reset_out = false; uop_buf[i].dst_idx = i * out_feat; 
uop_buf[i].src_idx = i * in_feat; @@ -418,9 +419,9 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression, if (multi_threaded) { if (!uop_compression) { int uop_idx = uop_size / 2; - for (int i = 0; i < batch; i ++) { - for (int j = 0; j < in_feat; j ++) { - for (int k = 0; k < out_feat; k ++) { + for (int i = 0; i < batch; i++) { + for (int j = 0; j < in_feat; j++) { + for (int k = 0; k < out_feat; k++) { uop_buf[uop_idx].reset_out = false; uop_buf[uop_idx].dst_idx = i * out_feat + k; uop_buf[uop_idx].src_idx = batch * in_feat + i * in_feat + j; @@ -430,7 +431,7 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression, } } } else { - for (int i = 0; i < batch; i ++) { + for (int i = 0; i < batch; i++) { uop_buf[batch+i].reset_out = false; uop_buf[batch+i].dst_idx = i * out_feat; uop_buf[batch+i].src_idx = batch * in_feat + i * in_feat; @@ -443,19 +444,18 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression, } VTAUop * getMapALUUops(int vector_size, bool uop_compression) { - // Derive the total uop size int uop_size = (uop_compression) ? 
1 : vector_size; // Allocate buffer #ifdef NO_SIM - VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED); + VTAUop *uop_buf = static_cast(VTAMemAlloc(sizeof(VTAUop) * uop_size, VTA_CACHED)); #else - VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size); + VTAUop *uop_buf = static_cast(malloc(sizeof(VTAUop) * uop_size)); #endif if (!uop_compression) { - for (int i = 0; i < vector_size; i ++) { + for (int i = 0; i < vector_size; i++) { uop_buf[i].reset_out = 0; uop_buf[i].dst_idx = i; uop_buf[i].src_idx = vector_size + i; @@ -473,65 +473,65 @@ void printParameters() { // Some debugging code printf("Size of VTAInsn: %d\n", sizeof(VTAGenericInsn)); printf("Size of VTAUop: %d\n", sizeof(VTAUop)); - printf("UOP_BUFF_DEPTH: %d\n", UOP_BUFF_DEPTH); - printf("LOG_UOP_BUFF_DEPTH: %d\n", LOG_UOP_BUFF_DEPTH); - printf("WGT_BUFF_DEPTH: %d\n", WGT_BUFF_DEPTH); - printf("LOG_WGT_BUFF_DEPTH: %d\n", LOG_WGT_BUFF_DEPTH); - printf("INP_BUFF_DEPTH: %d\n", INP_BUFF_DEPTH); - printf("LOG_INP_BUFF_DEPTH: %d\n", LOG_INP_BUFF_DEPTH); - printf("ACC_BUFF_DEPTH: %d\n", ACC_BUFF_DEPTH); - printf("LOG_ACC_BUFF_DEPTH: %d\n", LOG_ACC_BUFF_DEPTH); - printf("WGT_WORDS: %d\n", WGT_BUFF_DEPTH*BLOCK_IN*BLOCK_OUT); - printf("INP_WORDS: %d\n", INP_BUFF_DEPTH*BLOCK_IN); - printf("ACC_WORDS: %d\n", ACC_BUFF_DEPTH*BLOCK_OUT); - printf("INS_ELEM_BYTES: %d\n", INS_ELEM_BYTES); - printf("UOP_ELEM_BYTES: %d\n", UOP_ELEM_BYTES); - printf("INP_ELEM_BYTES: %d\n", INP_ELEM_BYTES); - printf("WGT_ELEM_BYTES: %d\n", WGT_ELEM_BYTES); - printf("ACC_ELEM_BYTES: %d\n", ACC_ELEM_BYTES); - printf("BLOCK_IN: %d\n", BLOCK_IN); - printf("BLOCK_OUT: %d\n", BLOCK_OUT); - printf("INSN_MEM_0 [%d-%d]\n", INSN_MEM_0_0, INSN_MEM_0_1); - printf("INSN_MEM_1 [%d]\n", INSN_MEM_1); - printf("INSN_MEM_2 [%d]\n", INSN_MEM_2); - printf("INSN_MEM_3 [%d]\n", INSN_MEM_3); - printf("INSN_MEM_4 [%d]\n", INSN_MEM_4); - printf("INSN_MEM_5 [%d-%d]\n", INSN_MEM_5_0, INSN_MEM_5_1); - printf("INSN_MEM_6 [%d-%d]\n", 
INSN_MEM_6_0, INSN_MEM_6_1); - printf("INSN_MEM_7 [%d-%d]\n", INSN_MEM_7_0, INSN_MEM_7_1); - printf("INSN_MEM_8 [%d-%d]\n", INSN_MEM_8_0, INSN_MEM_8_1); - printf("INSN_MEM_9 [%d-%d]\n", INSN_MEM_9_0, INSN_MEM_9_1); - printf("INSN_MEM_A [%d-%d]\n", INSN_MEM_A_0, INSN_MEM_A_1); - printf("INSN_MEM_B [%d-%d]\n", INSN_MEM_B_0, INSN_MEM_B_1); - printf("INSN_MEM_C [%d-%d]\n", INSN_MEM_C_0, INSN_MEM_C_1); - printf("INSN_MEM_D [%d-%d]\n", INSN_MEM_D_0, INSN_MEM_D_1); - printf("INSN_MEM_E [%d-%d]\n", INSN_MEM_E_0, INSN_MEM_E_1); - printf("INSN_GEM_0 [%d-%d]\n", INSN_GEM_0_0, INSN_GEM_0_1); - printf("INSN_GEM_1 [%d]\n", INSN_GEM_1); - printf("INSN_GEM_2 [%d]\n", INSN_GEM_2); - printf("INSN_GEM_3 [%d]\n", INSN_GEM_3); - printf("INSN_GEM_4 [%d]\n", INSN_GEM_4); - printf("INSN_GEM_5 [%d-%d]\n", INSN_GEM_5_0, INSN_GEM_5_1); - printf("INSN_GEM_6 [%d-%d]\n", INSN_GEM_6_0, INSN_GEM_6_1); - printf("INSN_GEM_7 [%d-%d]\n", INSN_GEM_7_0, INSN_GEM_7_1); - printf("INSN_GEM_8 [%d-%d]\n", INSN_GEM_8_0, INSN_GEM_8_1); - printf("INSN_GEM_9 [%d-%d]\n", INSN_GEM_9_0, INSN_GEM_9_1); - printf("INSN_GEM_A [%d-%d]\n", INSN_GEM_A_0, INSN_GEM_A_1); - printf("INSN_GEM_B [%d-%d]\n", INSN_GEM_B_0, INSN_GEM_B_1); - printf("INSN_GEM_C [%d-%d]\n", INSN_GEM_C_0, INSN_GEM_C_1); - printf("INSN_GEM_D [%d-%d]\n", INSN_GEM_D_0, INSN_GEM_D_1); - printf("INSN_GEM_E [%d-%d]\n", INSN_GEM_E_0, INSN_GEM_E_1); - printf("INSN_ALU_D [%d-%d]\n", INSN_ALU_D_0, INSN_ALU_D_1); - printf("INSN_ALU_E [%d]\n", INSN_ALU_E); - printf("INSN_ALU_F [%d-%d]\n", INSN_ALU_F_0, INSN_ALU_F_1); - printf("UOP_GEM_0 [%d]\n", UOP_GEM_0); - printf("UOP_GEM_1 [%d-%d]\n", UOP_GEM_1_0, UOP_GEM_1_1); - printf("UOP_GEM_2 [%d-%d]\n", UOP_GEM_2_0, UOP_GEM_2_1); - printf("UOP_GEM_3 [%d-%d]\n", UOP_GEM_3_0, UOP_GEM_3_1); - printf("UOP_ALU_0 [%d]\n", UOP_ALU_0); - printf("UOP_ALU_1 [%d-%d]\n", UOP_ALU_1_0, UOP_ALU_1_1); - printf("UOP_ALU_2 [%d-%d]\n", UOP_ALU_2_0, UOP_ALU_2_1); - printf("UOP_ALU_3 [%d-%d]\n", UOP_ALU_3_0, UOP_ALU_3_1); + 
printf("VTA_UOP_BUFF_DEPTH: %d\n", VTA_UOP_BUFF_DEPTH); + printf("VTA_LOG_UOP_BUFF_DEPTH: %d\n", VTA_LOG_UOP_BUFF_DEPTH); + printf("VTA_WGT_BUFF_DEPTH: %d\n", VTA_WGT_BUFF_DEPTH); + printf("VTA_LOG_WGT_BUFF_DEPTH: %d\n", VTA_LOG_WGT_BUFF_DEPTH); + printf("VTA_INP_BUFF_DEPTH: %d\n", VTA_INP_BUFF_DEPTH); + printf("VTA_LOG_INP_BUFF_DEPTH: %d\n", VTA_LOG_INP_BUFF_DEPTH); + printf("VTA_ACC_BUFF_DEPTH: %d\n", VTA_ACC_BUFF_DEPTH); + printf("VTA_LOG_ACC_BUFF_DEPTH: %d\n", VTA_LOG_ACC_BUFF_DEPTH); + printf("VTA_WGT_WORDS: %d\n", VTA_WGT_BUFF_DEPTH*VTA_BLOCK_IN*VTA_BLOCK_OUT); + printf("VTA_INP_WORDS: %d\n", VTA_INP_BUFF_DEPTH*VTA_BLOCK_IN); + printf("VTA_ACC_WORDS: %d\n", VTA_ACC_BUFF_DEPTH*VTA_BLOCK_OUT); + printf("VTA_INS_ELEM_BYTES: %d\n", VTA_INS_ELEM_BYTES); + printf("VTA_UOP_ELEM_BYTES: %d\n", VTA_UOP_ELEM_BYTES); + printf("VTA_INP_ELEM_BYTES: %d\n", VTA_INP_ELEM_BYTES); + printf("VTA_WGT_ELEM_BYTES: %d\n", VTA_WGT_ELEM_BYTES); + printf("VTA_ACC_ELEM_BYTES: %d\n", VTA_ACC_ELEM_BYTES); + printf("VTA_BLOCK_IN: %d\n", VTA_BLOCK_IN); + printf("VTA_BLOCK_OUT: %d\n", VTA_BLOCK_OUT); + printf("VTA_INSN_MEM_0 [%d-%d]\n", VTA_INSN_MEM_0_0, VTA_INSN_MEM_0_1); + printf("VTA_INSN_MEM_1 [%d]\n", VTA_INSN_MEM_1); + printf("VTA_INSN_MEM_2 [%d]\n", VTA_INSN_MEM_2); + printf("VTA_INSN_MEM_3 [%d]\n", VTA_INSN_MEM_3); + printf("VTA_INSN_MEM_4 [%d]\n", VTA_INSN_MEM_4); + printf("VTA_INSN_MEM_5 [%d-%d]\n", VTA_INSN_MEM_5_0, VTA_INSN_MEM_5_1); + printf("VTA_INSN_MEM_6 [%d-%d]\n", VTA_INSN_MEM_6_0, VTA_INSN_MEM_6_1); + printf("VTA_INSN_MEM_7 [%d-%d]\n", VTA_INSN_MEM_7_0, VTA_INSN_MEM_7_1); + printf("VTA_INSN_MEM_8 [%d-%d]\n", VTA_INSN_MEM_8_0, VTA_INSN_MEM_8_1); + printf("VTA_INSN_MEM_9 [%d-%d]\n", VTA_INSN_MEM_9_0, VTA_INSN_MEM_9_1); + printf("VTA_INSN_MEM_A [%d-%d]\n", VTA_INSN_MEM_A_0, VTA_INSN_MEM_A_1); + printf("VTA_INSN_MEM_B [%d-%d]\n", VTA_INSN_MEM_B_0, VTA_INSN_MEM_B_1); + printf("VTA_INSN_MEM_C [%d-%d]\n", VTA_INSN_MEM_C_0, VTA_INSN_MEM_C_1); + printf("VTA_INSN_MEM_D [%d-%d]\n", 
VTA_INSN_MEM_D_0, VTA_INSN_MEM_D_1); + printf("VTA_INSN_MEM_E [%d-%d]\n", VTA_INSN_MEM_E_0, VTA_INSN_MEM_E_1); + printf("VTA_INSN_GEM_0 [%d-%d]\n", VTA_INSN_GEM_0_0, VTA_INSN_GEM_0_1); + printf("VTA_INSN_GEM_1 [%d]\n", VTA_INSN_GEM_1); + printf("VTA_INSN_GEM_2 [%d]\n", VTA_INSN_GEM_2); + printf("VTA_INSN_GEM_3 [%d]\n", VTA_INSN_GEM_3); + printf("VTA_INSN_GEM_4 [%d]\n", VTA_INSN_GEM_4); + printf("VTA_INSN_GEM_5 [%d-%d]\n", VTA_INSN_GEM_5_0, VTA_INSN_GEM_5_1); + printf("VTA_INSN_GEM_6 [%d-%d]\n", VTA_INSN_GEM_6_0, VTA_INSN_GEM_6_1); + printf("VTA_INSN_GEM_7 [%d-%d]\n", VTA_INSN_GEM_7_0, VTA_INSN_GEM_7_1); + printf("VTA_INSN_GEM_8 [%d-%d]\n", VTA_INSN_GEM_8_0, VTA_INSN_GEM_8_1); + printf("VTA_INSN_GEM_9 [%d-%d]\n", VTA_INSN_GEM_9_0, VTA_INSN_GEM_9_1); + printf("VTA_INSN_GEM_A [%d-%d]\n", VTA_INSN_GEM_A_0, VTA_INSN_GEM_A_1); + printf("VTA_INSN_GEM_B [%d-%d]\n", VTA_INSN_GEM_B_0, VTA_INSN_GEM_B_1); + printf("VTA_INSN_GEM_C [%d-%d]\n", VTA_INSN_GEM_C_0, VTA_INSN_GEM_C_1); + printf("VTA_INSN_GEM_D [%d-%d]\n", VTA_INSN_GEM_D_0, VTA_INSN_GEM_D_1); + printf("VTA_INSN_GEM_E [%d-%d]\n", VTA_INSN_GEM_E_0, VTA_INSN_GEM_E_1); + printf("VTA_INSN_ALU_D [%d-%d]\n", VTA_INSN_ALU_D_0, VTA_INSN_ALU_D_1); + printf("VTA_INSN_ALU_E [%d]\n", VTA_INSN_ALU_E); + printf("VTA_INSN_ALU_F [%d-%d]\n", VTA_INSN_ALU_F_0, VTA_INSN_ALU_F_1); + printf("VTA_UOP_GEM_0 [%d]\n", VTA_UOP_GEM_0); + printf("VTA_UOP_GEM_1 [%d-%d]\n", VTA_UOP_GEM_1_0, VTA_UOP_GEM_1_1); + printf("VTA_UOP_GEM_2 [%d-%d]\n", VTA_UOP_GEM_2_0, VTA_UOP_GEM_2_1); + printf("VTA_UOP_GEM_3 [%d-%d]\n", VTA_UOP_GEM_3_0, VTA_UOP_GEM_3_1); + printf("VTA_UOP_ALU_0 [%d]\n", VTA_UOP_ALU_0); + printf("VTA_UOP_ALU_1 [%d-%d]\n", VTA_UOP_ALU_1_0, VTA_UOP_ALU_1_1); + printf("VTA_UOP_ALU_2 [%d-%d]\n", VTA_UOP_ALU_2_0, VTA_UOP_ALU_2_1); + printf("VTA_UOP_ALU_3 [%d-%d]\n", VTA_UOP_ALU_3_0, VTA_UOP_ALU_3_1); } void printInstruction(int num_insn, VTAGenericInsn *insns) { @@ -544,84 +544,111 @@ void printInstruction(int num_insn, VTAGenericInsn *insns) { 
union VTAInsn c; // Iterate over all instructions printf("DEBUG - There are %u instructions\n", num_insn); - for (int i = 0; i < num_insn; i ++) { + for (int i = 0; i < num_insn; i++) { // Fetch instruction and decode opcode c.generic = insns[i]; printf("DEBUG - INSTRUCTION %u: ", i); - if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) { + if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) { // Print instruction field information - if (c.mem.opcode == OPCODE_LOAD) { + if (c.mem.opcode == VTA_OPCODE_LOAD) { printf("LOAD "); - if (c.mem.memory_type == MEM_ID_UOP) printf("UOP\n"); - if (c.mem.memory_type == MEM_ID_WGT) printf("WGT\n"); - if (c.mem.memory_type == MEM_ID_INP) printf("INP\n"); - if (c.mem.memory_type == MEM_ID_ACC) printf("ACC\n"); + if (c.mem.memory_type == VTA_MEM_ID_UOP) printf("UOP\n"); + if (c.mem.memory_type == VTA_MEM_ID_WGT) printf("WGT\n"); + if (c.mem.memory_type == VTA_MEM_ID_INP) printf("INP\n"); + if (c.mem.memory_type == VTA_MEM_ID_ACC) printf("ACC\n"); } - if (c.mem.opcode == OPCODE_STORE) { + if (c.mem.opcode == VTA_OPCODE_STORE) { printf("STORE ACC\n"); } printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", - (int) c.mem.pop_prev_dep, (int) c.mem.pop_next_dep, - (int) c.mem.push_prev_dep, (int) c.mem.push_next_dep); - printf("\tDRAM: 0x%08x, SRAM:0x%04x\n", (int) c.mem.dram_base, (int) c.mem.sram_base); - printf("\ty: size=%d, pad=[%d, %d]\n", (int) c.mem.y_size, (int) c.mem.y_pad_0, - (int) c.mem.y_pad_1); - printf("\tx: size=%d, stride=%d, pad=[%d, %d]\n", (int) c.mem.x_size, (int) c.mem.x_stride, - (int) c.mem.x_pad_0, (int) c.mem.x_pad_1); - if (c.mem.opcode == OPCODE_STORE) { - if (c.mem.pop_prev_dep) g2s_queue --; - if (c.mem.push_prev_dep) s2g_queue ++; - } else if (c.mem.opcode == OPCODE_LOAD && - (c.mem.memory_type == MEM_ID_INP || c.mem.memory_type == MEM_ID_WGT)) { - if (c.mem.pop_next_dep) g2l_queue --; - if (c.mem.push_next_dep) l2g_queue ++; + 
static_cast(c.mem.pop_prev_dep), + static_cast(c.mem.pop_next_dep), + static_cast(c.mem.push_prev_dep), + static_cast(c.mem.push_next_dep)); + printf("\tDRAM: 0x%08x, SRAM:0x%04x\n", + static_cast(c.mem.dram_base), + static_cast(c.mem.sram_base)); + printf("\ty: size=%d, pad=[%d, %d]\n", + static_cast(c.mem.y_size), + static_cast(c.mem.y_pad_0), + static_cast(c.mem.y_pad_1)); + printf("\tx: size=%d, stride=%d, pad=[%d, %d]\n", + static_cast(c.mem.x_size), + static_cast(c.mem.x_stride), + static_cast(c.mem.x_pad_0), + static_cast(c.mem.x_pad_1)); + if (c.mem.opcode == VTA_OPCODE_STORE) { + if (c.mem.pop_prev_dep) g2s_queue--; + if (c.mem.push_prev_dep) s2g_queue++; + } else if (c.mem.opcode == VTA_OPCODE_LOAD && + (c.mem.memory_type == VTA_MEM_ID_INP || c.mem.memory_type == VTA_MEM_ID_WGT)) { + if (c.mem.pop_next_dep) g2l_queue--; + if (c.mem.push_next_dep) l2g_queue++; } else { - if (c.mem.pop_prev_dep) l2g_queue --; - if (c.mem.push_prev_dep) g2l_queue ++; - if (c.mem.pop_next_dep) s2g_queue --; - if (c.mem.push_next_dep) g2s_queue ++; + if (c.mem.pop_prev_dep) l2g_queue--; + if (c.mem.push_prev_dep) g2l_queue++; + if (c.mem.pop_next_dep) s2g_queue--; + if (c.mem.push_next_dep) g2s_queue++; } - } else if (c.mem.opcode == OPCODE_GEMM) { + } else if (c.mem.opcode == VTA_OPCODE_GEMM) { // Print instruction field information printf("GEVM\n"); printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", - (int) c.mem.pop_prev_dep, (int) c.mem.pop_next_dep, - (int) c.mem.push_prev_dep, (int) c.mem.push_next_dep); - printf("\trange (%d, %d)\n", (int) c.gemm.uop_bgn, (int) c.gemm.uop_end); - printf("\touter loop - iter: %d, acc: %d, inp: %d, wgt: %d\n", (int) c.gemm.iter_out, - (int) c.gemm.dst_factor_out, (int) c.gemm.src_factor_out, - (int) c.gemm.wgt_factor_out); - printf("\tinner loop - iter: %d, acc: %d, inp: %d, wgt: %d\n", (int) c.gemm.iter_in, - (int) c.gemm.dst_factor_in, (int) c.gemm.src_factor_in, - (int) c.gemm.wgt_factor_in); - if 
(c.gemm.pop_prev_dep) l2g_queue --; - if (c.gemm.push_prev_dep) g2l_queue ++; - if (c.gemm.pop_next_dep) s2g_queue --; - if (c.gemm.push_next_dep) g2s_queue ++; - } else if (c.mem.opcode == OPCODE_FINISH) { + static_cast(c.mem.pop_prev_dep), + static_cast(c.mem.pop_next_dep), + static_cast(c.mem.push_prev_dep), + static_cast(c.mem.push_next_dep)); + printf("\trange (%d, %d)\n", + static_cast(c.gemm.uop_bgn), + static_cast(c.gemm.uop_end)); + printf("\touter loop - iter: %d, acc: %d, inp: %d, wgt: %d\n", + static_cast(c.gemm.iter_out), + static_cast(c.gemm.dst_factor_out), + static_cast(c.gemm.src_factor_out), + static_cast(c.gemm.wgt_factor_out)); + printf("\tinner loop - iter: %d, acc: %d, inp: %d, wgt: %d\n", + static_cast(c.gemm.iter_in), + static_cast(c.gemm.dst_factor_in), + static_cast(c.gemm.src_factor_in), + static_cast(c.gemm.wgt_factor_in)); + if (c.gemm.pop_prev_dep) l2g_queue--; + if (c.gemm.push_prev_dep) g2l_queue++; + if (c.gemm.pop_next_dep) s2g_queue--; + if (c.gemm.push_next_dep) g2s_queue++; + } else if (c.mem.opcode == VTA_OPCODE_FINISH) { printf("FINISH\n"); printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", - (int) c.mem.pop_prev_dep, (int) c.mem.pop_next_dep, - (int) c.mem.push_prev_dep, (int) c.mem.push_next_dep); - if (c.gemm.pop_prev_dep) l2g_queue --; - if (c.gemm.push_prev_dep) g2l_queue ++; - if (c.gemm.pop_next_dep) s2g_queue --; - if (c.gemm.push_next_dep) g2s_queue ++; - } else if (c.mem.opcode == OPCODE_ALU) { + static_cast(c.mem.pop_prev_dep), + static_cast(c.mem.pop_next_dep), + static_cast(c.mem.push_prev_dep), + static_cast(c.mem.push_next_dep)); + if (c.gemm.pop_prev_dep) l2g_queue--; + if (c.gemm.push_prev_dep) g2l_queue++; + if (c.gemm.pop_next_dep) s2g_queue--; + if (c.gemm.push_next_dep) g2s_queue++; + } else if (c.mem.opcode == VTA_OPCODE_ALU) { // Print instruction field information printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm)); printf("\tdep - pop prev: %d, pop next: %d, 
push prev: %d, push next: %d\n", - (int) c.mem.pop_prev_dep, (int) c.mem.pop_next_dep, - (int) c.mem.push_prev_dep, (int) c.mem.push_next_dep); - printf("\trange (%d, %d)\n", (int) c.alu.uop_bgn, (int) c.alu.uop_end); - printf("\touter loop - iter: %d, dst: %d, src: %d\n", (int) c.alu.iter_out, - (int) c.alu.dst_factor_out, (int) c.alu.src_factor_out); - printf("\tinner loop - iter: %d, dst: %d, src: %d\n", (int) c.alu.iter_in, - (int) c.alu.dst_factor_in, (int) c.alu.src_factor_in); - if (c.alu.pop_prev_dep) l2g_queue --; - if (c.alu.push_prev_dep) g2l_queue ++; - if (c.alu.pop_next_dep) s2g_queue --; - if (c.alu.push_next_dep) g2s_queue ++; + static_cast(c.mem.pop_prev_dep), + static_cast(c.mem.pop_next_dep), + static_cast(c.mem.push_prev_dep), + static_cast(c.mem.push_next_dep)); + printf("\trange (%d, %d)\n", + static_cast(c.alu.uop_bgn), + static_cast(c.alu.uop_end)); + printf("\touter loop - iter: %d, dst: %d, src: %d\n", + static_cast(c.alu.iter_out), + static_cast(c.alu.dst_factor_out), + static_cast(c.alu.src_factor_out)); + printf("\tinner loop - iter: %d, dst: %d, src: %d\n", + static_cast(c.alu.iter_in), + static_cast(c.alu.dst_factor_in), + static_cast(c.alu.src_factor_in)); + if (c.alu.pop_prev_dep) l2g_queue--; + if (c.alu.push_prev_dep) g2l_queue++; + if (c.alu.pop_next_dep) s2g_queue--; + if (c.alu.push_next_dep) g2s_queue++; } } printf("DEBUG - l2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue); @@ -632,174 +659,193 @@ void printInstruction(int num_insn, VTAGenericInsn *insns) { void printMicroOp(int num_uop, VTAUop *uops) { // Iterate over all micro ops printf("DEBUG - There are %u micro-ops\n", num_uop); - for (int i = 0; i < num_uop; i ++) { + for (int i = 0; i < num_uop; i++) { // Read micro-op printf("DEBUG - UOP %u: ", i); printf("rst_out=%u, acc=%u, inp= %u, wgt=%u\n", uops[i].reset_out, uops[i].dst_idx, uops[i].src_idx, uops[i].wgt_idx); - } } int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_compression) { 
- - assert(batch % BATCH == 0); - assert(vector_size % BLOCK_OUT == 0); - assert(!(opcode == ALU_OPCODE_SHL && !use_imm)); - assert(!(opcode == ALU_OPCODE_SHR && !use_imm)); - + // Some assertions + assert(batch % VTA_BATCH == 0); + assert(vector_size % VTA_BLOCK_OUT == 0); + assert(!(opcode == VTA_ALU_OPCODE_SHL && !use_imm)); + assert(!(opcode == VTA_ALU_OPCODE_SHR && !use_imm)); printf("=====================================================================================\n"); printf("INFO - ALU test of %s: batch=%d, vector_size=%d, uop_compression=%d\n", getOpcodeString(opcode, use_imm), batch, vector_size, uop_compression); // Instruction count - int ins_size = 3 * batch / BATCH + 2; + int ins_size = 3 * batch / VTA_BATCH + 2; // Micro op count - int uop_size = uop_compression ? 1 : vector_size / BLOCK_OUT; + int uop_size = uop_compression ? 1 : vector_size / VTA_BLOCK_OUT; // Input/output elements in each transfer - int tx_size = vector_size / BLOCK_OUT; + int tx_size = vector_size / VTA_BLOCK_OUT; // Number of input sets to be generated int input_sets = (use_imm) ? 
1 : 2; // Make sure we don't exceed buffer bounds - assert(uop_size <= UOP_BUFF_DEPTH); - assert(tx_size * input_sets <= ACC_BUFF_DEPTH); + assert(uop_size <= VTA_UOP_BUFF_DEPTH); + assert(tx_size * input_sets <= VTA_ACC_BUFF_DEPTH); // Immediate values - acc_T *immediate = (acc_T *) malloc(sizeof(acc_T) * batch / BATCH); - for (int b = 0; b < batch / BATCH; b ++) { - if (opcode == ALU_OPCODE_MIN) { - immediate[b] = (acc_T) (rand() % (1LL << (INP_WIDTH / 2)) - (1LL << (INP_WIDTH / 2 - 1))); - } else if (opcode == ALU_OPCODE_MAX) { - immediate[b] = (acc_T) (rand() % (1LL << (INP_WIDTH / 2)) - (1LL << (INP_WIDTH / 2 - 1))); - } else if (opcode == ALU_OPCODE_ADD) { - immediate[b] = (acc_T) (rand() % (1LL << (INP_WIDTH / 2)) - (1LL << (INP_WIDTH / 2 - 1))); - } else if (opcode == ALU_OPCODE_SUB) { - immediate[b] = (acc_T) (rand() % (1LL << (INP_WIDTH / 2)) - (1LL << (INP_WIDTH / 2 - 1))); - } else if (opcode == ALU_OPCODE_MUL) { - immediate[b] = (acc_T) (rand() % (1LL << (INP_WIDTH / 2)) - (1LL << (INP_WIDTH / 2 - 1))); - } else if (opcode == ALU_OPCODE_SHL) { - immediate[b] = (acc_T) (rand() % (INP_WIDTH + 1)); - } else if (opcode == ALU_OPCODE_SHR) { - immediate[b] = (acc_T) (rand() % (INP_WIDTH + 1)); + acc_T *immediate = static_cast(malloc(sizeof(acc_T) * batch / VTA_BATCH)); + for (int b = 0; b < batch / VTA_BATCH; b++) { + if (opcode == VTA_ALU_OPCODE_MIN) { + immediate[b] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1))); + } else if (opcode == VTA_ALU_OPCODE_MAX) { + immediate[b] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1))); + } else if (opcode == VTA_ALU_OPCODE_ADD) { + immediate[b] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1))); + } else if (opcode == VTA_ALU_OPCODE_SUB) { + immediate[b] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1))); + 
} else if (opcode == VTA_ALU_OPCODE_MUL) { + immediate[b] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1))); + } else if (opcode == VTA_ALU_OPCODE_SHL) { + immediate[b] = static_cast(rand_r(&globalSeed) % (VTA_INP_WIDTH + 1)); + } else if (opcode == VTA_ALU_OPCODE_SHR) { + immediate[b] = static_cast(rand_r(&globalSeed) % (VTA_INP_WIDTH + 1)); } } // Initialize instructions - VTAGenericInsn *insn_buf = (VTAGenericInsn *) allocBuffer(sizeof(VTAGenericInsn) * ins_size); + VTAGenericInsn *insn_buf = + static_cast(allocBuffer(sizeof(VTAGenericInsn) * ins_size)); int insn_idx = 0; - insn_buf[insn_idx ++] = get1DLoadStoreInsn(OPCODE_LOAD, MEM_ID_UOP, 0, 0, uop_size, 0, 0, 0, 0); - for (int b = 0; b < batch; b += BATCH) { - insn_buf[insn_idx ++] = get2DLoadStoreInsn( - OPCODE_LOAD, // opcode - MEM_ID_ACC, // vector size - 0, // sram offset - b / BATCH * tx_size * input_sets, // dram offset - 1, // y size - tx_size * input_sets, // x size - tx_size * input_sets, // x stride - 0, // y pad - 0, // x pad - 0, // pop prev dep - b > 0, // pop next dep - 0, // push prev dep - 0); // push next dep - insn_buf[insn_idx ++] = getALUInsn( - opcode, // opcode - tx_size, // vector size - use_imm, // use imm - immediate[b / BATCH], // imm - uop_compression, // uop compression - 0, // pop prev dep - 0, // pop next dep - 0, // push prev dep - 1); // push next dep - insn_buf[insn_idx ++] = get2DLoadStoreInsn( - OPCODE_STORE, // opcode - MEM_ID_OUT, // vector size - 0, // sram offset - b / BATCH * tx_size, // dram offset - 1, // y size - tx_size, // x size - tx_size, // x stride - 0, // y pad - 0, // x pad - 1, // pop prev dep - 0, // pop next dep - 1, // push prev dep - 0); // push next dep + insn_buf[insn_idx++] = + get1DLoadStoreInsn(VTA_OPCODE_LOAD, VTA_MEM_ID_UOP, 0, 0, uop_size, 0, 0, 0, 0); + for (int b = 0; b < batch; b += VTA_BATCH) { + insn_buf[insn_idx++] = get2DLoadStoreInsn( + VTA_OPCODE_LOAD, // opcode + VTA_MEM_ID_ACC, // 
vector size + 0, // sram offset + b / VTA_BATCH * tx_size * input_sets, // dram offset + 1, // y size + tx_size * input_sets, // x size + tx_size * input_sets, // x stride + 0, // y pad + 0, // x pad + 0, // pop prev dep + b > 0, // pop next dep + 0, // push prev dep + 0); // push next dep + insn_buf[insn_idx++] = getALUInsn( + opcode, // opcode + tx_size, // vector size + use_imm, // use imm + immediate[b / VTA_BATCH], // imm + uop_compression, // uop compression + 0, // pop prev dep + 0, // pop next dep + 0, // push prev dep + 1); // push next dep + insn_buf[insn_idx++] = get2DLoadStoreInsn( + VTA_OPCODE_STORE, // opcode + VTA_MEM_ID_OUT, // vector size + 0, // sram offset + b / VTA_BATCH * tx_size, // dram offset + 1, // y size + tx_size, // x size + tx_size, // x stride + 0, // y pad + 0, // x pad + 1, // pop prev dep + 0, // pop next dep + 1, // push prev dep + 0); // push next dep } // Finish - insn_buf[insn_idx ++] = getFinishInsn(0, 1); - + insn_buf[insn_idx++] = getFinishInsn(0, 1); // Prepare the uop buffer VTAUop * uop_buf = getMapALUUops(tx_size, uop_compression); -#if DEBUG==1 +#if VTA_DEBUG == 1 printInstruction(ins_size, insn_buf); printMicroOp(uop_size, uop_buf); #endif // Initialize the input/output data acc_T **inputs = alloc2dArray(batch, vector_size * input_sets); - for (int i = 0; i < batch; i ++) { - for (int j = 0; j < vector_size * input_sets; j ++) { - if (opcode == ALU_OPCODE_MIN) { - inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH - 1)) - (1LL << (INP_WIDTH - 2))); - } else if (opcode == ALU_OPCODE_MAX) { - inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH - 1)) - (1LL << (INP_WIDTH - 2))); - } else if (opcode == ALU_OPCODE_ADD) { - inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH - 1)) - (1LL << (INP_WIDTH - 2))); - } else if (opcode == ALU_OPCODE_SUB) { - inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH - 1)) - (1LL << (INP_WIDTH - 2))); - } else if (opcode == ALU_OPCODE_MUL) { - inputs[i][j] = (acc_T) (rand() % (1LL << 
(INP_WIDTH / 2)) - (1LL << (INP_WIDTH / 2 - 1))); - } else if (opcode == ALU_OPCODE_SHL) { - inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH - 1)) - (1LL << (INP_WIDTH - 2))); - } else if (opcode == ALU_OPCODE_SHR) { - inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH - 1)) - (1LL << (INP_WIDTH - 2))); + for (int i = 0; i < batch; i++) { + for (int j = 0; j < vector_size * input_sets; j++) { + if (opcode == VTA_ALU_OPCODE_MIN) { + inputs[i][j] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2))); + } else if (opcode == VTA_ALU_OPCODE_MAX) { + inputs[i][j] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2))); + } else if (opcode == VTA_ALU_OPCODE_ADD) { + inputs[i][j] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2))); + } else if (opcode == VTA_ALU_OPCODE_SUB) { + inputs[i][j] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2))); + } else if (opcode == VTA_ALU_OPCODE_MUL) { + inputs[i][j] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1))); + } else if (opcode == VTA_ALU_OPCODE_SHL) { + inputs[i][j] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2))); + } else if (opcode == VTA_ALU_OPCODE_SHR) { + inputs[i][j] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2))); } } } // Compute reference output out_T **outputs_ref = alloc2dArray(batch, vector_size); - for (int i = 0; i < batch; i ++) { - for (int j = 0; j < vector_size; j ++) { + for (int i = 0; i < batch; i++) { + for (int j = 0; j < vector_size; j++) { acc_T tmp = 0; - if (opcode == ALU_OPCODE_MIN) { + if (opcode == VTA_ALU_OPCODE_MIN) { if (!use_imm) { - tmp = inputs[i][j] < inputs[i][j + vector_size] ? 
inputs[i][j] : inputs[i][j + vector_size]; + tmp = inputs[i][j] < inputs[i][j + vector_size] ? + inputs[i][j] : + inputs[i][j + vector_size]; } else { - tmp = inputs[i][j] < immediate[i / BATCH] ? inputs[i][j] : immediate[i / BATCH]; + tmp = inputs[i][j] < immediate[i / VTA_BATCH] ? + inputs[i][j] : + immediate[i / VTA_BATCH]; } - } else if (opcode == ALU_OPCODE_MAX) { + } else if (opcode == VTA_ALU_OPCODE_MAX) { if (!use_imm) { - tmp = inputs[i][j] > inputs[i][j + vector_size] ? inputs[i][j] : inputs[i][j + vector_size]; + tmp = inputs[i][j] > inputs[i][j + vector_size] ? + inputs[i][j] : + inputs[i][j + vector_size]; } else { - tmp = inputs[i][j] > immediate[i / BATCH] ? inputs[i][j] : immediate[i / BATCH]; + tmp = inputs[i][j] > immediate[i / VTA_BATCH] ? + inputs[i][j] : + immediate[i / VTA_BATCH]; } - } else if (opcode == ALU_OPCODE_ADD) { + } else if (opcode == VTA_ALU_OPCODE_ADD) { if (!use_imm) { tmp = inputs[i][j] + inputs[i][j + vector_size]; } else { - tmp = inputs[i][j] + immediate[i / BATCH]; + tmp = inputs[i][j] + immediate[i / VTA_BATCH]; } - } else if (opcode == ALU_OPCODE_SUB) { + } else if (opcode == VTA_ALU_OPCODE_SUB) { if (!use_imm) { tmp = inputs[i][j] - inputs[i][j + vector_size]; } else { - tmp = inputs[i][j] - immediate[i / BATCH]; + tmp = inputs[i][j] - immediate[i / VTA_BATCH]; } - } else if (opcode == ALU_OPCODE_MUL) { + } else if (opcode == VTA_ALU_OPCODE_MUL) { if (!use_imm) { tmp = inputs[i][j] * inputs[i][j + vector_size]; } else { - tmp = inputs[i][j] * immediate[i / BATCH]; + tmp = inputs[i][j] * immediate[i / VTA_BATCH]; } - } else if (opcode == ALU_OPCODE_SHL) { - tmp = inputs[i][j] << immediate[i / BATCH]; - } else if (opcode == ALU_OPCODE_SHR) { - tmp = inputs[i][j] >> immediate[i / BATCH]; + } else if (opcode == VTA_ALU_OPCODE_SHL) { + tmp = inputs[i][j] << immediate[i / VTA_BATCH]; + } else if (opcode == VTA_ALU_OPCODE_SHR) { + tmp = inputs[i][j] >> immediate[i / VTA_BATCH]; } // Set outputs_ref[i][j] = (out_T) tmp; @@ 
-807,44 +853,51 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp } // Pack input buffer - acc_T *bias_buf = (acc_T *) allocBuffer(ACC_ELEM_BYTES * batch * tx_size * input_sets); - packBuffer(bias_buf, inputs, batch, vector_size * input_sets, BATCH, BLOCK_OUT); + acc_T *bias_buf = + static_cast(allocBuffer(VTA_ACC_ELEM_BYTES * batch * tx_size * input_sets)); + packBuffer( + bias_buf, inputs, batch, vector_size * input_sets, VTA_BATCH, VTA_BLOCK_OUT); // Prepare output buffer - out_T *output_buf = (out_T *) allocBuffer(INP_ELEM_BYTES * batch * tx_size * input_sets); + out_T *output_buf = + static_cast(allocBuffer(VTA_INP_ELEM_BYTES * batch * tx_size * input_sets)); #ifdef NO_SIM // Invoke the VTA uint64_t t_fpga = vta(ins_size, insn_buf, uop_buf, NULL, NULL, bias_buf, output_buf); // Report on timining - printf("INFO - Synchronization time: %.3lfms\n", (double) t_fpga / 1E6); - printf("INFO - Throughput: %.3lfGOps/s\n", (double) vector_size * batch / t_fpga); + printf("INFO - Synchronization time: %.3fms\n", static_cast(t_fpga) / 1E6); + printf("INFO - Throughput: %.3fGOps/s\n", static_cast(vector_size * batch) / t_fpga); #else // Invoke the VTA - vta( - ins_size, - (volatile insn_T *) insn_buf, - (volatile uop_T *) uop_buf, - (volatile inp_vec_T *) NULL, - (volatile wgt_vec_T *) NULL, - (volatile acc_vec_T *) bias_buf, - (volatile inp_vec_T *) output_buf - ); + vta(ins_size, + (volatile insn_T *) insn_buf, + (volatile uop_T *) uop_buf, + (volatile inp_vec_T *) NULL, + (volatile wgt_vec_T *) NULL, + (volatile acc_vec_T *) bias_buf, + (volatile inp_vec_T *) output_buf); #endif // Unpack output buffer out_T **outputs = alloc2dArray(batch, vector_size); - unpackBuffer(outputs, output_buf, batch, vector_size, BATCH, BLOCK_OUT); + unpackBuffer(outputs, + output_buf, + batch, + vector_size, + VTA_BATCH, + VTA_BLOCK_OUT); // Correctness checks int err = 0; - for (int i = 0; i < batch; i ++) { - for (int j = 0; j < vector_size; j ++) { + 
for (int i = 0; i < batch; i++) { + for (int j = 0; j < vector_size; j++) { if (outputs_ref[i][j] != outputs[i][j]) { err++; -#if DEBUG==1 - printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j, (int) outputs_ref[i][j], - (int) outputs[i][j]); +#if VTA_DEBUG == 1 + printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j, + static_cast(outputs_ref[i][j]), + static_cast(outputs[i][j])); #endif } } @@ -867,169 +920,180 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp printf("INFO - ALU test failed, got %d errors!\n", err); return -1; } - } int blocked_gemm_test(int batch, int channels, int block, bool uop_compression, int virtual_threads) { - - assert(block % BLOCK_IN == 0); - assert(block % BLOCK_OUT == 0); - assert(block % BATCH == 0); + // Some assertions + assert(block % VTA_BLOCK_IN == 0); + assert(block % VTA_BLOCK_OUT == 0); + assert(block % VTA_BATCH == 0); assert(channels % block == 0); assert(batch % block == 0); printf("=====================================================================================\n"); - printf("INFO - Blocked GEMM test: batch=%d, channels=%d, block=%d, uop_compression=%d, \ -virtual_threads=%d\n", - batch, channels, block, uop_compression, virtual_threads); + printf("INFO - Blocked GEMM test: batch=%d, channels=%d, block=%d, uop_comp=%d, vt=%d\n", + batch, channels, block, uop_compression, virtual_threads); // Input/output channels int in_feat = channels; int out_feat = channels; // Derive number of elements that need to be loaded/stored int ins_size = batch / block * out_feat / block * (2 + in_feat / block * 3) + 2; - int uop_size = uop_compression ? block / BATCH * virtual_threads : - block / BATCH * block / BLOCK_IN * block / BLOCK_OUT * virtual_threads; - int inp_size = batch / BATCH * in_feat / BLOCK_IN; - int wgt_size = in_feat / BLOCK_IN * out_feat / BLOCK_OUT; - int out_size = batch / BATCH * out_feat / BLOCK_OUT; + int uop_size = uop_compression ? 
+ block / VTA_BATCH * virtual_threads : + block / VTA_BATCH * block / VTA_BLOCK_IN * block / VTA_BLOCK_OUT * virtual_threads; + int inp_size = batch / VTA_BATCH * in_feat / VTA_BLOCK_IN; + int wgt_size = in_feat / VTA_BLOCK_IN * out_feat / VTA_BLOCK_OUT; + int out_size = batch / VTA_BATCH * out_feat / VTA_BLOCK_OUT; // Blocked buffer sizes (in terms of elements) - int inp_block_size = block / BATCH * block / BLOCK_IN; - int wgt_block_size = block / BLOCK_IN * block / BLOCK_OUT; - int out_block_size = block / BATCH * block / BLOCK_OUT; + int inp_block_size = block / VTA_BATCH * block / VTA_BLOCK_IN; + int wgt_block_size = block / VTA_BLOCK_IN * block / VTA_BLOCK_OUT; + int out_block_size = block / VTA_BATCH * block / VTA_BLOCK_OUT; // Make sure we don't exceed buffer bounds - assert(uop_size <= UOP_BUFF_DEPTH); - assert(inp_block_size <= INP_BUFF_DEPTH); - assert(wgt_block_size <= WGT_BUFF_DEPTH); - assert(out_block_size <= ACC_BUFF_DEPTH); + assert(uop_size <= VTA_UOP_BUFF_DEPTH); + assert(inp_block_size <= VTA_INP_BUFF_DEPTH); + assert(wgt_block_size <= VTA_WGT_BUFF_DEPTH); + assert(out_block_size <= VTA_ACC_BUFF_DEPTH); // Initialize instruction buffer - VTAGenericInsn *insn_buf = (VTAGenericInsn *) allocBuffer(sizeof(VTAGenericInsn) * ins_size); + VTAGenericInsn *insn_buf = + static_cast(allocBuffer(sizeof(VTAGenericInsn) * ins_size)); int insn_idx = 0; // Load uops - insn_buf[insn_idx ++] = get1DLoadStoreInsn(OPCODE_LOAD, MEM_ID_UOP, 0, 0, uop_size, 0, 0, 0, 0); + insn_buf[insn_idx++] = get1DLoadStoreInsn(VTA_OPCODE_LOAD, + VTA_MEM_ID_UOP, + 0, + 0, + uop_size, + 0, + 0, + 0, + 0); // Iterate over batch blocks for (int i = 0; i < batch; i += block) { // Iterate over output channel blocks for (int j = 0; j < out_feat; j += block) { // Load bias block (pop next if not first, push prev) - insn_buf[insn_idx ++] = get2DLoadStoreInsn( - OPCODE_LOAD, // opcode - MEM_ID_ACC, // type - 0, // sram offset - (i / BATCH * out_feat + j) / BLOCK_OUT, // dram offset - block / 
BATCH, // y size - block / BLOCK_OUT, // x size - out_feat / BLOCK_OUT, // x stride - 0, // y pad - 0, // x pad - 0, // pop prev dep - (i > 0 || j > 0), // pop next dep - (virtual_threads == 1), // push prev dep - 0); // push next dep + insn_buf[insn_idx++] = get2DLoadStoreInsn( + VTA_OPCODE_LOAD, // opcode + VTA_MEM_ID_ACC, // type + 0, // sram offset + (i / VTA_BATCH * out_feat + j) / VTA_BLOCK_OUT, // dram offset + block / VTA_BATCH, // y size + block / VTA_BLOCK_OUT, // x size + out_feat / VTA_BLOCK_OUT, // x stride + 0, // y pad + 0, // x pad + 0, // pop prev dep + (i > 0 || j > 0), // pop next dep + (virtual_threads == 1), // push prev dep + 0); // push next dep // Iterate over input channel blocks for (int k = 0; k < in_feat; k += block * virtual_threads) { for (int l = 0; l < block * virtual_threads; l += block) { // Derive dependence flags - bool pop = (virtual_threads == 1) ? - 1 : - (i > 0 || j > 0 || k > 0 || l > 0) && (k + l != block * virtual_threads - block); + bool pop = (virtual_threads == 1) ? + 1 : + (i > 0 || j > 0 || k > 0 || l > 0) && (k + l != block * virtual_threads - block); bool push_prev = (virtual_threads == 1) ? 
- ((k + l) != in_feat - block) : - ((k + l) != in_feat - virtual_threads * block) && - ( - (k + l != in_feat - block) || - (j != out_feat - block) || - (i != batch - block) - ); + ((k + l) != in_feat - block) : + ((k + l) != in_feat - virtual_threads * block) && + ( + (k + l != in_feat - block) || + (j != out_feat - block) || + (i != batch - block)); bool push_next = (k + l == in_feat - block); // Load weight block (pop next) - insn_buf[insn_idx ++] = get2DLoadStoreInsn( - OPCODE_LOAD, // opcode - MEM_ID_WGT, // type - l / BLOCK_IN * block / BLOCK_OUT, // sram offset - (j / BLOCK_OUT * in_feat + k + l) / BLOCK_IN, // dram offset - block / BLOCK_OUT, // y size - block / BLOCK_IN, // x size - in_feat / BLOCK_IN, // x stride - 0, // y pad - 0, // x pad - 0, // pop prev dep - pop, // pop next dep - 0, // push prev dep - 0); // push next dep + insn_buf[insn_idx++] = get2DLoadStoreInsn( + VTA_OPCODE_LOAD, // opcode + VTA_MEM_ID_WGT, // type + l / VTA_BLOCK_IN * block / VTA_BLOCK_OUT, // sram offset + (j / VTA_BLOCK_OUT * in_feat + k + l) / VTA_BLOCK_IN, // dram offset + block / VTA_BLOCK_OUT, // y size + block / VTA_BLOCK_IN, // x size + in_feat / VTA_BLOCK_IN, // x stride + 0, // y pad + 0, // x pad + 0, // pop prev dep + pop, // pop next dep + 0, // push prev dep + 0); // push next dep // Load input block (push next) - insn_buf[insn_idx ++] = get2DLoadStoreInsn( - OPCODE_LOAD, // opcode - MEM_ID_INP, // type - l / BLOCK_IN * block / BATCH, // sram offset - (i / BATCH * in_feat + k + l) / BLOCK_IN, // dram offset - block / BATCH, // y size - block / BLOCK_IN, // x size - in_feat / BLOCK_IN, // x stride - 0, // y pad - 0, // x pad - 0, // pop prev dep - 0, // pop next dep - 0, // push prev dep - 1); // push next dep + insn_buf[insn_idx++] = get2DLoadStoreInsn( + VTA_OPCODE_LOAD, // opcode + VTA_MEM_ID_INP, // type + l / VTA_BLOCK_IN * block / VTA_BATCH, // sram offset + (i / VTA_BATCH * in_feat + k + l) / VTA_BLOCK_IN, // dram offset + block / VTA_BATCH, // y size + 
block / VTA_BLOCK_IN, // x size + in_feat / VTA_BLOCK_IN, // x stride + 0, // y pad + 0, // x pad + 0, // pop prev dep + 0, // pop next dep + 0, // push prev dep + 1); // push next dep // Perform GEMM (pop prev, push prev if not last, push next if last) - insn_buf[insn_idx ++] = getGEMMInsn( - l / block * uop_size / virtual_threads, // uop offset - block / BATCH, // batch - block / BLOCK_IN, // in_feat - block / BLOCK_OUT, // out_feat - uop_compression, // uop_compression - 1, // pop_prev_dep - 0, // pop_next_dep - push_prev, // push prev dep - push_next); // push_next_dep + insn_buf[insn_idx++] = getGEMMInsn( + l / block * uop_size / virtual_threads, // uop offset + block / VTA_BATCH, // batch + block / VTA_BLOCK_IN, // in_feat + block / VTA_BLOCK_OUT, // out_feat + uop_compression, // uop_compression + 1, // pop_prev_dep + 0, // pop_next_dep + push_prev, // push prev dep + push_next); // push_next_dep } } // Store output block (pop prev, push prev if not last) - insn_buf[insn_idx ++] = get2DLoadStoreInsn( - OPCODE_STORE, // opcode - MEM_ID_OUT, // type - 0, // sram offset - (i / BATCH * out_feat + j) / BLOCK_OUT, // dram offset - block / BATCH, // y size - block / BLOCK_OUT, // x size - out_feat / BLOCK_OUT, // x stride - 0, // y pad - 0, // x pad - 1, // pop prev dep - 0, // pop next dep - 1, // pop prev dep - 0); // push next dep + insn_buf[insn_idx++] = get2DLoadStoreInsn( + VTA_OPCODE_STORE, // opcode + VTA_MEM_ID_OUT, // type + 0, // sram offset + (i / VTA_BATCH * out_feat + j) / VTA_BLOCK_OUT, // dram offset + block / VTA_BATCH, // y size + block / VTA_BLOCK_OUT, // x size + out_feat / VTA_BLOCK_OUT, // x stride + 0, // y pad + 0, // x pad + 1, // pop prev dep + 0, // pop next dep + 1, // push prev dep + 0); // push next dep } } // Finish - insn_buf[insn_idx ++] = getFinishInsn(0, 1); + insn_buf[insn_idx++] = getFinishInsn(0, 1); // Prepare the uop buffer - VTAUop * uop_buf = getGEMMUops(block / BATCH, block / BLOCK_IN, block / BLOCK_OUT, uop_compression, - 
virtual_threads > 1); - -#if DEBUG==1 + VTAUop * uop_buf = getGEMMUops( + block / VTA_BATCH, + block / VTA_BLOCK_IN, + block / VTA_BLOCK_OUT, + uop_compression, + virtual_threads > 1); + +#if VTA_DEBUG == 1 printInstruction(ins_size, insn_buf); printMicroOp(uop_size, uop_buf); #endif // Initialize inputs - inp_T **inputs = allocInit2dArray(batch, in_feat); + inp_T **inputs = allocInit2dArray(batch, in_feat); // Initialize weights - wgt_T **weights = allocInit2dArray(out_feat, in_feat); + wgt_T **weights = allocInit2dArray(out_feat, in_feat); // Initialize biases - acc_T **biases = allocInit2dArray(batch, out_feat); + acc_T **biases = allocInit2dArray(batch, out_feat); // Reference GEMM implementation out_T **outputs_ref = alloc2dArray(batch, out_feat); - for (int i = 0; i < batch; i ++) { - for (int j = 0; j < out_feat; j ++) { + for (int i = 0; i < batch; i++) { + for (int j = 0; j < out_feat; j++) { acc_T sum = biases[i][j]; - for (int k = 0; k < in_feat; k ++) { + for (int k = 0; k < in_feat; k++) { sum += (acc_T) (inputs[i][k] * weights[j][k]); } // Set @@ -1038,49 +1102,75 @@ virtual_threads=%d\n", } // Prepare the input buffer - inp_T *input_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * inp_size); - packBuffer(input_buf, inputs, batch, in_feat, BATCH, BLOCK_IN); + inp_T *input_buf = static_cast(allocBuffer(VTA_INP_ELEM_BYTES * inp_size)); + packBuffer(input_buf, + inputs, + batch, + in_feat, + VTA_BATCH, + VTA_BLOCK_IN); // Prepare the weight buffer - wgt_T *weight_buf = (wgt_T *) allocBuffer(WGT_ELEM_BYTES * wgt_size); - packBuffer(weight_buf, weights, out_feat, in_feat, BLOCK_OUT, BLOCK_IN); + wgt_T *weight_buf = static_cast(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size)); + packBuffer(weight_buf, + weights, + out_feat, + in_feat, + VTA_BLOCK_OUT, + VTA_BLOCK_IN); // Prepare the bias buffer - acc_T *bias_buf = (acc_T *) allocBuffer(ACC_ELEM_BYTES * out_size); - packBuffer(bias_buf, biases, batch, out_feat, BATCH, BLOCK_OUT); + acc_T *bias_buf = 
static_cast(allocBuffer(VTA_ACC_ELEM_BYTES * out_size)); + packBuffer(bias_buf, + biases, + batch, + out_feat, + VTA_BATCH, + VTA_BLOCK_OUT); // Prepare the output buffer - out_T *output_buf = (out_T *) allocBuffer(INP_ELEM_BYTES * out_size); + out_T *output_buf = static_cast(allocBuffer(VTA_INP_ELEM_BYTES * out_size)); #ifdef NO_SIM // Invoke the VTA - uint64_t t_fpga = vta(ins_size, insn_buf, uop_buf, input_buf, weight_buf, bias_buf, output_buf); + uint64_t t_fpga = vta(ins_size, + insn_buf, + uop_buf, + input_buf, + weight_buf, + bias_buf, + output_buf); // Report on timining - printf("INFO - Synchronization time: %.3lfms\n", (double) t_fpga / 1E6); - printf("INFO - Throughput: %.3lfGOPs/s\n", (double) batch * in_feat * out_feat * 2 / t_fpga); + printf("INFO - Synchronization time: %.3lfms\n", static_cast(t_fpga) / 1E6); + printf("INFO - Throughput: %.3lfGOPs/s\n", + static_cast(batch) * in_feat * out_feat * 2 / t_fpga); #else // Invoke the VTA - vta( - ins_size, - (volatile insn_T *) insn_buf, - (volatile uop_T *) uop_buf, - (volatile inp_vec_T *) input_buf, - (volatile wgt_vec_T *) weight_buf, - (volatile acc_vec_T *) bias_buf, - (volatile inp_vec_T *) output_buf - ); + vta(ins_size, + (volatile insn_T *) insn_buf, + (volatile uop_T *) uop_buf, + (volatile inp_vec_T *) input_buf, + (volatile wgt_vec_T *) weight_buf, + (volatile acc_vec_T *) bias_buf, + (volatile inp_vec_T *) output_buf); #endif // Unpack output data out_T **outputs = alloc2dArray(batch, out_feat); - unpackBuffer(outputs, output_buf, batch, out_feat, BATCH, BLOCK_OUT); + unpackBuffer(outputs, + output_buf, + batch, + out_feat, + VTA_BATCH, + VTA_BLOCK_OUT); // Correctness checks int err = 0; - for (int i = 0; i < batch; i ++) { - for (int j = 0; j < out_feat; j ++) { + for (int i = 0; i < batch; i++) { + for (int j = 0; j < out_feat; j++) { if (outputs_ref[i][j] != outputs[i][j]) { err++; -#if DEBUG==1 - printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j, (int) outputs_ref[i][j], - 
(int) outputs[i][j]); +#if VTA_DEBUG == 1 + printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j, + static_cast(outputs_ref[i][j]), + static_cast(outputs[i][j])); #endif } } @@ -1092,12 +1182,12 @@ virtual_threads=%d\n", free2dArray(biases, batch, out_feat); free2dArray(outputs_ref, batch, out_feat); free2dArray(outputs, batch, out_feat); - freeBuffer((void *) insn_buf); - freeBuffer((void *) uop_buf); - freeBuffer((void *) input_buf); - freeBuffer((void *) weight_buf); - freeBuffer((void *) bias_buf); - freeBuffer((void *) output_buf); + freeBuffer(insn_buf); + freeBuffer(uop_buf); + freeBuffer(input_buf); + freeBuffer(weight_buf); + freeBuffer(bias_buf); + freeBuffer(output_buf); if (err == 0) { printf("INFO - Blocked GEMM test successful!\n"); @@ -1106,5 +1196,4 @@ virtual_threads=%d\n", printf("INFO - Blocked GEMM test failed, got %d errors!\n", err); return -1; } - } diff --git a/vta/tests/hardware/common/test_lib.h b/vta/tests/hardware/common/test_lib.h index fad2e4daddfb..037e2fcee72f 100644 --- a/vta/tests/hardware/common/test_lib.h +++ b/vta/tests/hardware/common/test_lib.h @@ -4,8 +4,8 @@ * \brief Test library for the VTA design simulation and driver tests. */ -#ifndef VTA_TESTLIB_H_ -#define VTA_TESTLIB_H_ +#ifndef TESTS_HARDWARE_COMMON_TEST_LIB_H_ +#define TESTS_HARDWARE_COMMON_TEST_LIB_H_ #include #include @@ -17,9 +17,9 @@ #include -#ifdef PYNQ_TARGET +#ifdef VTA_PYNQ_TARGET #include "../../../src/pynq/pynq_driver.h" -#endif //PYNQ_TARGET +#endif // VTA_PYNQ_TARGET typedef uint64_t axi_T; typedef uint32_t uop_T; @@ -28,7 +28,7 @@ typedef int8_t inp_T; typedef int8_t out_T; typedef int32_t acc_T; -uint64_t vta ( +uint64_t vta( uint32_t insn_count, VTAGenericInsn *insns, VTAUop *uops, @@ -37,11 +37,11 @@ uint64_t vta ( acc_T *biases, inp_T *outputs); -#else //NO_SIM +#else // NO_SIM #include "../../../hardware/vivado/src/vta.h" -#endif //NO_SIM +#endif // NO_SIM /*! * \brief Returns opcode string. 
@@ -300,4 +300,4 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp int blocked_gemm_test(int batch, int channels, int block, bool uop_compression, int virtual_threads); -#endif // VTA_TESTLIB_H_ \ No newline at end of file +#endif // TESTS_HARDWARE_COMMON_TEST_LIB_H_ diff --git a/vta/tests/hardware/pynq/metal_test.cc b/vta/tests/hardware/pynq/metal_test.cc index b5147399c18c..01e73f46bd72 100644 --- a/vta/tests/hardware/pynq/metal_test.cc +++ b/vta/tests/hardware/pynq/metal_test.cc @@ -14,140 +14,135 @@ #include "../common/test_lib.h" // VTA invocation (present the same abstraction as in the simulation tests) -uint64_t vta ( - uint32_t insn_count, - VTAGenericInsn *insns, - VTAUop *uops, - inp_T *inputs, - wgt_T *weights, - acc_T *biases, - inp_T *outputs) { - - // Performance counter variables - uint64_t t_fpga; - struct timespec start, stop; - - // Derive bitstream file - char bitstream[64]; - char str_batch_size[4]; - char str_block_out_size[4]; - char str_block_in_size[4]; - char str_block_bit_width[4]; - sprintf(str_batch_size, "%d", BATCH); - sprintf(str_block_out_size, "%d", BLOCK_OUT); - sprintf(str_block_in_size, "%d", BLOCK_IN); - sprintf(str_block_bit_width, "%d", WGT_WIDTH); - strcpy(bitstream, "vta.bit"); - -#if DEBUG==1 - printf("INFO - Programming FPGA: %s!\n", bitstream); +uint64_t vta( + uint32_t insn_count, + VTAGenericInsn *insns, + VTAUop *uops, + inp_T *inputs, + wgt_T *weights, + acc_T *biases, + inp_T *outputs) { + // Performance counter variables + uint64_t t_fpga; + struct timespec start, stop; + + // Derive bitstream file + char bitstream[128]; + char str_batch_size[4]; + char str_block_out_size[4]; + char str_block_in_size[4]; + char str_block_bit_width[4]; + snprintf(str_batch_size, sizeof(str_batch_size), "%d", VTA_BATCH); + snprintf(str_block_out_size, sizeof(str_block_out_size), "%d", VTA_BLOCK_OUT); + snprintf(str_block_in_size, sizeof(str_block_in_size), "%d", VTA_BLOCK_IN); + 
snprintf(str_block_bit_width, sizeof(str_block_bit_width), "%d", VTA_WGT_WIDTH); + snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit"); + +#if VTA_DEBUG == 1 + printf("INFO - Programming FPGA: %s!\n", bitstream); #endif - // Program VTA - VTAProgram(bitstream); - // Get VTA handles - VTAHandle vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE); - VTAHandle vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE); - VTAHandle vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE); - VTAHandle vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE); - - // Physical address pointers - uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0; - uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0; - uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0; - uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0; - uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0; - uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0; - -#if DEBUG==1 - printf("INFO - Starting FPGA!\n"); + // Program VTA + VTAProgram(bitstream); + // Get VTA handles + VTAHandle vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE); + VTAHandle vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE); + VTAHandle vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE); + VTAHandle vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE); + + // Physical address pointers + uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0; + uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0; + uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0; + uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0; + uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0; + uint32_t output_phy = outputs ? 
cma_get_phy_addr(outputs) : 0; + +#if VTA_DEBUG == 1 + printf("INFO - Starting FPGA!\n"); #endif - clock_gettime(CLOCK_REALTIME, &start); - - // FETCH @ 0x10 : Data signal of insn_count_V - VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count); - // FETCH @ 0x18 : Data signal of insns_V - if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy); - // LOAD @ 0x10 : Data signal of inputs_V - if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy); - // LOAD @ 0x18 : Data signal of weight_V - if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy); - // COMPUTE @ 0x20 : Data signal of uops_V - if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy); - // COMPUTE @ 0x28 : Data signal of biases_V - if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy); - // STORE @ 0x10 : Data signal of outputs_V - if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy); - - // VTA start - VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1); - VTAWriteMappedReg(vta_load_handle, 0x0, 0x81); - VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81); - VTAWriteMappedReg(vta_store_handle, 0x0, 0x81); - - int flag = 0, t = 0; - for (t = 0; t < 10000000; ++t) { - flag = VTAReadMappedReg(vta_compute_handle, 0x18); - if (flag & VTA_DONE) break; - } - - if (t==10000000) { - printf("\tWARNING: VTA TIMEOUT!!!!\n"); - } -#if DEBUG==1 - else { - printf("INFO - FPGA Finished!\n"); - } + clock_gettime(CLOCK_REALTIME, &start); + + // FETCH @ 0x10 : Data signal of insn_count_V + VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count); + // FETCH @ 0x18 : Data signal of insns_V + if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy); + // LOAD @ 0x10 : Data signal of inputs_V + if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy); + // LOAD @ 0x18 : Data signal of weight_V + if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy); + // COMPUTE @ 0x20 : Data signal of uops_V + if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, 
uop_phy); + // COMPUTE @ 0x28 : Data signal of biases_V + if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy); + // STORE @ 0x10 : Data signal of outputs_V + if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy); + + // VTA start + VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1); + VTAWriteMappedReg(vta_load_handle, 0x0, 0x81); + VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81); + VTAWriteMappedReg(vta_store_handle, 0x0, 0x81); + + int flag = 0, t = 0; + for (t = 0; t < 10000000; ++t) { + flag = VTAReadMappedReg(vta_compute_handle, 0x18); + if (flag & VTA_DONE) break; + } + + if (t == 10000000) { + printf("\tWARNING: VTA TIMEOUT!!!!\n"); +#if VTA_DEBUG == 1 + } else { + printf("INFO - FPGA Finished!\n"); #endif + } - clock_gettime(CLOCK_REALTIME, &stop); - t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec); + clock_gettime(CLOCK_REALTIME, &stop); + t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec); - // Unmap VTA register - VTAUnmapRegister(vta_fetch_handle, VTA_RANGE); - VTAUnmapRegister(vta_load_handle, VTA_RANGE); - VTAUnmapRegister(vta_compute_handle, VTA_RANGE); - VTAUnmapRegister(vta_store_handle, VTA_RANGE); + // Unmap VTA register + VTAUnmapRegister(vta_fetch_handle, VTA_RANGE); + VTAUnmapRegister(vta_load_handle, VTA_RANGE); + VTAUnmapRegister(vta_compute_handle, VTA_RANGE); + VTAUnmapRegister(vta_store_handle, VTA_RANGE); - return t_fpga; -}; - -int main(void) -{ + return t_fpga; +} -#if DEBUG==1 - printParameters(); +int main(void) { +#if VTA_DEBUG == 1 + printParameters(); #endif - int status = 0; - - // Run ALU test (vector-scalar operators) - status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, true); - status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, false); - status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, true); - status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, false); - status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, true); - status |= 
alu_test(ALU_OPCODE_SHR, true, 16, 128, false); - - // Run ALU test (vector-vector operators) - status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, true); - status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, false); - status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, true); - status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, false); - - // Run blocked GEMM test - status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 2); - status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 2); - status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 1); - status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 1); - - if (status==0) { - printf("\nINFO - Unit tests successful!\n"); - } else { - printf("\nINTO - Unit tests failed!\n"); - } - - return status; - + int status = 0; + + // Run ALU test (vector-scalar operators) + status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, true); + status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, false); + status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, true); + status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, false); + status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, true); + status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, false); + + // Run ALU test (vector-vector operators) + status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, true); + status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, false); + status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, true); + status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, false); + + // Run blocked GEMM test + status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2); + status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2); + status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1); + status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1); + + if (status == 0) { + printf("\nINFO - Unit tests successful!\n"); + } else { + printf("\nINFO - Unit tests failed!\n"); + } + + return status; }