-
Notifications
You must be signed in to change notification settings - Fork 15
Kernel Development
This code is automatically generated:
/*[[[cog
import cog
from cog_acctempl import *
from dut_params import *
]]]*/
//[[[end]]] (checksum: d41d8cd98f00b204e9800998ecf8427e)
/*[[[cog
cog.outl("#ifndef __%s_HLS_H__" % dut.nm.upper())
cog.outl("#define __%s_HLS_H__" % dut.nm.upper())
]]]*/
#ifndef __MEMCPY_HLS_H__
#define __MEMCPY_HLS_H__
//[[[end]]] (checksum: f5d0a4d0e706ab8b40e85b21f0328629)
#ifndef __SYNTHESIS__
#include <type_traits>
#endif
#include "systemc.h"
#include "types.h"
#include "Config.h"
#include "ga_tlm_fifo.h"
#include "hls_utils.h"
/*[[[cog
for m in dut.modules.values():
cog.outl("#include \"%s.h\"" % m.nm)
]]]*/
//[[[end]]] (checksum: d41d8cd98f00b204e9800998ecf8427e)
// memcpy_hls: SystemC module for the memcpy accelerator kernel.
//
// It exposes clock/reset, the start/config/done control ports, and four
// memory-system fifos, and runs three clocked threads (inp_fetcher,
// inp_addr_gen, out_addr_gen) whose bodies are pulled in from per-thread
// headers at the bottom of the class.
//
// Everything between a cog "]]]*/" marker and the matching "//[[[end]]]" line
// is generated from dut_params.py and carries a checksum; change the
// description and regenerate rather than editing that text by hand.
/*[[[cog
cog.outl("class %s_hls : public sc_module" % dut.nm)
]]]*/
class memcpy_hls : public sc_module
//[[[end]]] (checksum: fa2560ce13aa08d8a581d079b39bf215)
{
public:
// Clock and reset. Every cthread below is clocked on clk.pos() and registers
// rst through async_reset_signal_is(rst, false).
sc_in_clk clk;
sc_in<bool> rst;
// functional ports
// config and start are inputs read by the threads; done is driven by the
// inp_fetcher thread.
sc_in<Config> config;
sc_in<bool> start;
sc_out<bool> done;
// memory ports
// Read side ("inp"): a request fifo going out and a response fifo coming in.
// Write side ("out"): a request fifo and a data fifo, both going out.
/*[[[cog
for p in dut.inps:
cog.outl("ga::tlm_fifo_%sout<%s > %s;" % (dut.isHier(RdReqPort(p.nm)),p.reqTy(),p.reqNmK()))
cog.outl("ga::tlm_fifo_%sin<%s > %s;" % (dut.isHier(RdRespPort(p.nm)),p.respTy(),p.respNmK()))
cog.outl("")
for p in dut.outs:
cog.outl("ga::tlm_fifo_%sout<%s > %s;" % (dut.isHier(WrReqPort(p.nm)),p.reqTy(),p.reqNmK()))
cog.outl("ga::tlm_fifo_%sout<%s > %s;" % (dut.isHier(WrDataPort(p.nm)),p.dataTy(),p.dataNmK()))
cog.outl("")
]]]*/
ga::tlm_fifo_out<MemTypedReadReqType<CacheLine> > inpReqOut;
ga::tlm_fifo_in<MemTypedReadRespType<CacheLine> > inpRespIn;
ga::tlm_fifo_out<MemTypedWriteReqType<CacheLine> > outReqOut;
ga::tlm_fifo_out<MemTypedWriteDataType<CacheLine> > outDataOut;
//[[[end]]] (checksum: fd9fde2da603878fca9336358740669c)
// Instantiate modules
/*[[[cog
for m in dut.modules.values():
cog.outl("%s %s_inst;" % (m.nm, m.nm))
]]]*/
//[[[end]]] (checksum: d41d8cd98f00b204e9800998ecf8427e)
// TLM fifos (between modules)
/*[[[cog
for f in dut.tlm_fifos:
cog.outl("ga::tlm_fifo<%s, %d> %s;" % (f.ty, f.capacity, f.nm))
]]]*/
//[[[end]]] (checksum: d41d8cd98f00b204e9800998ecf8427e)
// storage fifos (between threads in this module)
/*[[[cog
for f in dut.storage_fifos:
if dut.find_parent(dut.put_tbl[f.nm]).nm == dut.module.nm:
cog.outl("ga::ga_storage_fifo<%s, %d> %s;" % (f.ty, f.capacity, f.nm))
]]]*/
//[[[end]]] (checksum: d41d8cd98f00b204e9800998ecf8427e)
/*[[[cog
cog.outl("SC_HAS_PROCESS(%s_hls);" % (dut.nm,))
]]]*/
SC_HAS_PROCESS(memcpy_hls);
//[[[end]]] (checksum: 1639f171cef6dba259fd16954ea97228)
// Constructor: names every port after its member, registers the three
// cthreads, and hands clk/rst to each memory fifo.
/*[[[cog
cog.outl("%s_hls(sc_module_name modname) :" % (dut.nm,))
]]]*/
memcpy_hls(sc_module_name modname) :
//[[[end]]] (checksum: 18fc9bafe7cadd2e6ff44e18aab1421e)
// Hand-written part of the initializer list; the generated part that follows
// adds the memory fifos (and module instances, when the design has any).
sc_module(modname)
, clk("clk"), rst("rst")
, config("config"), start("start"), done("done")
/*[[[cog
for p in dut.inps:
cog.outl(", {0}(\"{0}\")".format( p.reqNmK()))
cog.outl(", {0}(\"{0}\")".format( p.respNmK()))
for p in dut.outs:
cog.outl(", {0}(\"{0}\")".format( p.reqNmK()))
cog.outl(", {0}(\"{0}\")".format( p.dataNmK()))
for m in dut.modules.values():
cog.outl(", {0}(\"{0}\")".format( m.nm + "_inst"))
]]]*/
, inpReqOut("inpReqOut")
, inpRespIn("inpRespIn")
, outReqOut("outReqOut")
, outDataOut("outDataOut")
//[[[end]]] (checksum: 2881ddb8430a98cd05d5dae52488ecb0)
{
// One SC_CTHREAD registration per cthread declared in dut_params.py.
/*[[[cog
for c in dut.cthreads.values():
cog.outl("SC_CTHREAD(%s, clk.pos());" % (c.nm,))
cog.outl("async_reset_signal_is(rst, false);")
cog.outl("")
]]]*/
SC_CTHREAD(inp_fetcher, clk.pos());
async_reset_signal_is(rst, false);
SC_CTHREAD(inp_addr_gen, clk.pos());
async_reset_signal_is(rst, false);
SC_CTHREAD(out_addr_gen, clk.pos());
async_reset_signal_is(rst, false);
//[[[end]]] (checksum: 773ff4522c18e0df27ce11fb3e624133)
// clk/rst hookup for memory fifos; the generator emits it only for ports
// where dut.isHier(...) returns the empty string.
/*[[[cog
for p in dut.inps:
if "" == dut.isHier(RdReqPort(p.nm)):
cog.outl("%s.clk_rst(clk, rst);" % (p.reqNmK(),))
if "" == dut.isHier(RdRespPort(p.nm)):
cog.outl("%s.clk_rst(clk, rst);" % (p.respNmK(),))
for p in dut.outs:
if "" == dut.isHier(WrReqPort(p.nm)):
cog.outl("%s.clk_rst(clk, rst);" % (p.reqNmK(),))
if "" == dut.isHier(WrDataPort(p.nm)):
cog.outl("%s.clk_rst(clk, rst);" % (p.dataNmK(),))
]]]*/
inpReqOut.clk_rst(clk, rst);
inpRespIn.clk_rst(clk, rst);
outReqOut.clk_rst(clk, rst);
outDataOut.clk_rst(clk, rst);
//[[[end]]] (checksum: 0436c52fea41c23b502c1f072eb76632)
// Storage fifos are given clk/rst only when USE_HLS is not defined.
#ifndef USE_HLS
/*[[[cog
for f in dut.storage_fifos:
if dut.find_parent(dut.put_tbl[f.nm]).nm == dut.module.nm:
cog.outl("%s.clk_rst(clk, rst);" % f.nm)
]]]*/
//[[[end]]] (checksum: d41d8cd98f00b204e9800998ecf8427e)
#endif
// Port binding for sub-module instances (this design generates none).
/*[[[cog
for m in dut.modules.values():
for sig in ["clk","rst","start","config"]:
cog.outl("{0}_inst.{1}({1});".format( m.nm, sig))
if m.writes_to_done:
cog.outl("{0}_inst.{1}({1});".format( m.nm, "done"))
for p in dut.inps:
if m.portOf( RdReqPort(p.nm)):
cog.outl("{0}_inst.{1}({1});".format( m.nm, p.reqNmK()))
if m.portOf( RdRespPort(p.nm)):
cog.outl("{0}_inst.{1}({1});".format( m.nm, p.respNmK()))
for p in dut.outs:
if m.portOf( WrReqPort(p.nm)):
cog.outl("{0}_inst.{1}({1});".format( m.nm, p.reqNmK()))
if m.portOf( WrDataPort(p.nm)):
cog.outl("{0}_inst.{1}({1});".format( m.nm, p.dataNmK()))
for f in dut.tlm_fifos:
cog.outl("{0}_inst.{1}({1});".format( m.nm, f.nm))
]]]*/
//[[[end]]] (checksum: d41d8cd98f00b204e9800998ecf8427e)
}
// The thread bodies are included inside the class body, so each
// memcpy-<thread>.h header defines a member function of memcpy_hls.
/*[[[cog
for c in dut.cthreads.values():
cog.outl("#include \"%s-%s.h\"" % (dut.nm,c.nm))
]]]*/
#include "memcpy-inp_fetcher.h"
#include "memcpy-inp_addr_gen.h"
#include "memcpy-out_addr_gen.h"
//[[[end]]] (checksum: 484e6e29ad0d99d170f7f9de8f0c3cb6)
};
/*[[[cog
cog.outl("#include \"%s_acc.h\"" % (dut.nm,))
]]]*/
#include "memcpy_acc.h"
//[[[end]]] (checksum: e868954c0cd9f92182731fd98ad0951c)
// Build-time knobs. Each one can be predefined (e.g. with -D on the compiler
// command line); otherwise NUM_AUS defaults to 1 and both channel counts
// default to NUM_AUS.
#ifndef NUM_AUS
#define NUM_AUS 1
#endif
#ifndef RD_CHANNELS
#define RD_CHANNELS NUM_AUS
#endif
#ifndef WR_CHANNELS
#define WR_CHANNELS NUM_AUS
#endif
// dut_t is the top-level accelerator type selected by NUM_AUS: the plain
// memcpy_acc when there is exactly one unit, otherwise the
// multi_acc_template_Np wrapper parameterized with the scheduler, the unit
// type, Config, and the two channel counts.
#if NUM_AUS == 1
/*[[[cog
cog.outl("typedef %s_acc dut_t;" % (dut.nm,))
]]]*/
typedef memcpy_acc dut_t;
//[[[end]]] (checksum: 0ce887778cbdeebc316aa28c2a590876)
#else
// The scheduler header is only needed for the multi-unit configuration.
/*[[[cog
cog.outl("#include \"%s_sched.h\"" % dut.nm)
cog.outl("")
cog.outl("typedef multi_acc_template_Np<NUM_AUS, %s_sched<NUM_AUS>, %s_acc, Config, RD_CHANNELS, WR_CHANNELS> dut_t;" % (dut.nm,dut.nm))
]]]*/
#include "memcpy_sched.h"
typedef multi_acc_template_Np<NUM_AUS, memcpy_sched<NUM_AUS>, memcpy_acc, Config, RD_CHANNELS, WR_CHANNELS> dut_t;
//[[[end]]] (checksum: a810fe1d315e268751e5f964942c0ebb)
#endif
#endif
We only need to add a small amount of custom code to make the accelerator perform the copy operations. Because we added the basic communication structure for the three cthreads to the dut_params.py
file, most of the SystemC boilerplate has already been created for us. For example, the SC_CTHREAD
and async_reset_signal_is
statements are already filled in.
SC_CTHREAD(inp_fetcher, clk.pos());
async_reset_signal_is(rst, false);
SC_CTHREAD(inp_addr_gen, clk.pos());
async_reset_signal_is(rst, false);
SC_CTHREAD(out_addr_gen, clk.pos());
async_reset_signal_is(rst, false);
The code we need to add will be in the three thread specific header files: memcpy-inp_fetcher.h
, memcpy-inp_addr_gen.h
, and memcpy-out_addr_gen.h
. We'll work on the first one---the thread that fetches responses from the inp
port of the memory system and then sends this data to the out
port of the memory system.
The template system creates a starting implementation as follows:
/*[[[cog
import cog
from cog_acctempl import *
from dut_params import dut
if "thread_nm" not in globals():
lst = cog.previous.lstrip('/').rstrip('\n').split('=')
assert( lst[0]=="thread_nm")
assert( len(lst)==2)
global thread_nm
thread_nm = lst[1]
cog.outl( "//thread_nm=" + thread_nm)
]]]*/
//thread_nm=inp_fetcher
//[[[end]]] (checksum: b3cc55cdf47df8e2833f7683e251c994)
// Declare helper methods and class variables
// Starting implementation of the inp_fetcher thread as emitted by the
// template system. The generated sections below produce the function
// signature, the reset call for each port assigned to this thread, and the
// initial value of done; the body of the start branch is left to the user.
/*[[[cog
c = dut.get_cthread(thread_nm)
cog.outl("void %s() {" % (c.nm,))
for p in c.ports:
cog.outl(" %s;" % p.reset)
]]]*/
void inp_fetcher() {
inpRespIn.reset_get();
outDataOut.reset_put();
//[[[end]]] (checksum: 78e3954429f1b9eaa6d398a4359e0c54)
// Declare and initialize local variables
// (done = false is emitted only for a thread marked writes_to_done)
/*[[[cog
if c.writes_to_done:
cog.outl("done = false;")
]]]*/
done = false;
//[[[end]]] (checksum: 872c87e47f55883a9563054415885245)
// This first wait() closes the reset block of the thread.
wait();
// Main execution loop; it never exits.
while (1) {
if ( start) {
// Fill in here
}
// Executed on every pass, whether or not start is asserted.
wait();
}
}
There is some complex cog
code at the top of the file that is used to remember the thread name in case you want to regenerate the interface after a change to the dut_params.py
file. The interesting part starts with the automatically generated reset of the two memory ports, inpRespIn
and outDataOut
.
These ports were assigned to this thread and that is why the reset code was generated.
We can now add the code needed to perform our simple computation.
We need to count the number of cache lines that have been processed so we know when we are done.
So we declare the local variable ip
and initialize it to zero.
// Reset block of inp_fetcher: everything up to the first wait().
inpRespIn.reset_get();
outDataOut.reset_put();
// Number of cache lines processed so far.
unsigned int ip = 0;
done = false;
wait();
The assignment of false
to done
has also been generated because the thread had the property writes_to_done
specified in its dut_params.py
description.
This completes the reset block of the thread.
In the main execution loop, we check to see if our loop counter has reached its final value, and if not, we perform a get
operation on the inpRespIn
port followed by a put
operation on the outDataOut
port. Otherwise, we raise the done
flag.
// Main execution loop of inp_fetcher (entered after the reset block).
while (1) {
if ( start) {
// Keep copying until ip reaches the cache-line count from config.
if ( ip != config.read().get_nCLs()) {
MemTypedReadRespType<CacheLine> wrapped_cl = inpRespIn.get();
// Re-wrap the payload of the read response as write data.
outDataOut.put(MemTypedWriteDataType<CacheLine>( wrapped_cl.data));
++ip;
} else {
// Every line has been forwarded. Nothing in this loop clears done again.
done = true;
}
}
wait();
}
The start
, done
, and config
variables are signal ports (sc_in
for start and config, sc_out for done) declared in the (generated) interface of the module.
See the Memory API page for more details on the memory system including the different types that are available. See the HLS IOLib page for information on how to perform blocking and non-blocking put and get communications.
Here is the final code for this thread:
// inp_fetcher thread: moves cache lines from the inpRespIn fifo to the
// outDataOut fifo and raises done once config's nCLs lines have been copied.
void inp_fetcher() {
// Reset block: reset both fifos owned by this thread and clear the state.
inpRespIn.reset_get();
outDataOut.reset_put();
// Number of cache lines processed so far.
unsigned int ip = 0;
done = false;
wait();
while (1) {
// Nothing happens until start is asserted.
if ( start) {
if ( ip != config.read().get_nCLs()) {
// NOTE(review): get()/put() look like the blocking forms described on the
// HLS IOLib page - confirm there.
MemTypedReadRespType<CacheLine> wrapped_cl = inpRespIn.get();
// Re-wrap the payload of the read response as write data.
outDataOut.put(MemTypedWriteDataType<CacheLine>( wrapped_cl.data));
++ip;
} else {
// Every line has been forwarded. ip and done are only reinitialized by the
// reset block above.
done = true;
}
}
wait();
}
}
We can also quickly code up the inp
address request thread. Here we want to put
a single request to inpReqOut
with the starting address and number of cachelines.
void inp_addr_gen() {
inpReqOut.reset_put();
bool already_sent = false;
wait();
while (1) {
if ( start) {
if ( !already_sent) {
inpReqOut.put(MemTypedReadReqType<CacheLine>( config.read().getInpAddr( 0), config.read().get_nCLs()));
already_sent = true;
}
}
wait();
}
}
The state variable already_sent
is used to ensure the request is only sent once.
Similar code is used for the out
address request thread.
void out_addr_gen() {
outReqOut.reset_put();
bool already_sent = false;
wait();
while (1) {
if ( start) {
if ( !already_sent) {
outReqOut.put(MemTypedWriteReqType<CacheLine>( config.read().getOutAddr( 0), config.read().get_nCLs()));
already_sent = true;
}
}
wait();
}
}
We can now link the software driver code from our earlier page together with our kernel implementation. Here are the first few lines of the driver code again:
#include "gtest/gtest.h"
#include "hld_alloc.h"
#include "AcclApp.h"
#include "Config.h"
TEST(AccelTest, SimpleTest) {
AcclApp theApp;
unsigned int n_cls = 1024;
unsigned long long sz = 2ULL*n_cls*64;
if ( theApp.alloc( sz)) {
unsigned char *WORKSPACE = theApp.m_JointVirt;
size_t WORKSPACE_SIZE = theApp.m_JointSize;
Config config;
...
We need to change one thing to get this to work with our hardware (SystemC) accelerator implementation.
Instead of including the pure software header AcclApp.h
we include the definition of class AcclApp
from memcpy_hls_tb.h
(which was automatically generated.)
If we surround this with a compile time conditional, we can share the software driver code between the pure-software and SystemC models:
#ifdef KERNEL_TEST
#include "memcpy_hls_tb.h"
#else
#include "AcclApp.h"
#endif
We need a simple Makefile
.
Here is one that includes a standard file, setting a few options before including the common make directives:
# Build settings for the kernel test. Every variable is set before the shared
# rules are included at the bottom.
# Optimization/debug switches; they show up on the compile and link lines.
DEBUG_FLAGS=-O2 -g
# Selects memcpy_hls_tb.h instead of AcclApp.h in the driver source.
CFLAGS += -DKERNEL_TEST
# Repository root, used to locate the common make directives.
HLD_ROOT = ../..
# Driver source file and the name of the executable to build.
SOURCES=tb.cpp
TARGET=accel_test
CXX=g++
# Common make directives shared across the project.
include $(HLD_ROOT)/common/Makefile.inc
We set the source file tb.cpp
, and the executable name accel_test
.
We turn on the KERNEL_TEST
preprocessor constant, so that our SystemC code will be linked in.
Running make
yields:
g++ -O2 -g -DKERNEL_TEST -std=c++11 -Wall -I../../common -I../../accio -I../../acctempl -Wno-virtual-move-assign -I/p/hdk/rtl/cad/x86-64_linux26/accellera/systemc/systemc-2.3.0/include -Wno-unused-label -I/nfs/site/disks/scl.work.58/ppt/aayupov/gtest/googletest/googletest/include -o tb.o -c tb.cpp
g++ -MM -DKERNEL_TEST -std=c++11 -Wall -I../../common -I../../accio -I../../acctempl -Wno-virtual-move-assign -I/p/hdk/rtl/cad/x86-64_linux26/accellera/systemc/systemc-2.3.0/include -Wno-unused-label -I/nfs/site/disks/scl.work.58/ppt/aayupov/gtest/googletest/googletest/include tb.cpp > tb.d
g++ -O2 -g -pthread -o accel_test tb.o /p/hdk/rtl/cad/x86-64_linux26/accellera/systemc/systemc-2.3.0/lib-linux64/libsystemc.a /nfs/site/disks/scl.work.58/ppt/aayupov/gtest/googletest/googletest/make/gtest_main.a
Running the compiled executable results in:
[COG_ENV_DIR] dlxc1340> ./accel_test
[==========] Running 1 test from 1 test case.
[----------] Global test environment set-up.
[----------] 1 test from AccelTest
[ RUN ] AccelTest.SimpleTest
Info: (I804) /IEEE_Std_1666/deprecated: interface and/or port binding in port constructors is deprecated
HW compute..
Results checked. 524288 of 524288 correct.
[ OK ] AccelTest.SimpleTest (2695 ms)
[----------] 1 test from AccelTest (2695 ms total)
[----------] Global test environment tear-down
[==========] 1 test from 1 test case ran. (2695 ms total)
[ PASSED ] 1 test.