Skip to content
This repository was archived by the owner on Jan 7, 2023. It is now read-only.

Kernel Development

Andrey Ayupov edited this page Mar 23, 2017 · 3 revisions

Modifying the Initial SystemC Kernel Code

This code is automatically generated:

/*[[[cog
     import cog
     from cog_acctempl import *
     from dut_params import *
  ]]]*/
//[[[end]]] (checksum: d41d8cd98f00b204e9800998ecf8427e)
/*[[[cog
     cog.outl("#ifndef __%s_HLS_H__" % dut.nm.upper())
     cog.outl("#define __%s_HLS_H__" % dut.nm.upper())
  ]]]*/
#ifndef __MEMCPY_HLS_H__
#define __MEMCPY_HLS_H__
//[[[end]]] (checksum: f5d0a4d0e706ab8b40e85b21f0328629)

#ifndef __SYNTHESIS__
#include <type_traits>
#endif

#include "systemc.h"

#include "types.h"

#include "Config.h"

#include "ga_tlm_fifo.h"

#include "hls_utils.h"

/*[[[cog
     for m in dut.modules.values():
       cog.outl("#include \"%s.h\"" % m.nm)
  ]]]*/
//[[[end]]] (checksum: d41d8cd98f00b204e9800998ecf8427e)

/*[[[cog
     cog.outl("class %s_hls : public sc_module" % dut.nm)
  ]]]*/
class memcpy_hls : public sc_module
//[[[end]]] (checksum: fa2560ce13aa08d8a581d079b39bf215)
{
  // SystemC module for the memcpy kernel. It exposes clock/reset, the
  // config/start/done control ports and four memory-side TLM fifo ports, and
  // registers three clocked threads whose bodies live in the per-thread
  // headers included at the bottom of the class.
  //
  // NOTE: this file is processed by cog. Text between a "]]]*/" line and the
  // following "//[[[end]]]" line is generated output protected by a checksum;
  // edit the cog code or dut_params.py instead of the generated text.

public:
  // Clock and reset inputs. Every cthread registered in the constructor is
  // clocked on clk.pos() and uses rst via async_reset_signal_is(rst, false).
  sc_in_clk clk;
  sc_in<bool> rst;

  // functional ports
  // config and start are inputs to this module; done is an output.
  sc_in<Config> config;
  sc_in<bool> start;
  sc_out<bool> done;

  // memory ports
  // One request-out/response-in pair per entry in dut.inps and one
  // request-out/data-out pair per entry in dut.outs.
  /*[[[cog
       for p in dut.inps:
         cog.outl("ga::tlm_fifo_%sout<%s > %s;" % (dut.isHier(RdReqPort(p.nm)),p.reqTy(),p.reqNmK()))
         cog.outl("ga::tlm_fifo_%sin<%s > %s;" % (dut.isHier(RdRespPort(p.nm)),p.respTy(),p.respNmK()))
         cog.outl("")
       for p in dut.outs:
         cog.outl("ga::tlm_fifo_%sout<%s > %s;" % (dut.isHier(WrReqPort(p.nm)),p.reqTy(),p.reqNmK()))
         cog.outl("ga::tlm_fifo_%sout<%s > %s;" % (dut.isHier(WrDataPort(p.nm)),p.dataTy(),p.dataNmK()))
         cog.outl("")
    ]]]*/
  ga::tlm_fifo_out<MemTypedReadReqType<CacheLine> > inpReqOut;
  ga::tlm_fifo_in<MemTypedReadRespType<CacheLine> > inpRespIn;

  ga::tlm_fifo_out<MemTypedWriteReqType<CacheLine> > outReqOut;
  ga::tlm_fifo_out<MemTypedWriteDataType<CacheLine> > outDataOut;

  //[[[end]]] (checksum: fd9fde2da603878fca9336358740669c)
  // Instantiate modules
  // (nothing is generated here for this design)
  /*[[[cog
       for m in dut.modules.values():
         cog.outl("%s %s_inst;" % (m.nm, m.nm))
    ]]]*/
  //[[[end]]] (checksum: d41d8cd98f00b204e9800998ecf8427e)

  // TLM fifos (between modules)
  // (nothing is generated here for this design)
  /*[[[cog
       for f in dut.tlm_fifos:
         cog.outl("ga::tlm_fifo<%s, %d> %s;" % (f.ty, f.capacity, f.nm))
    ]]]*/
  //[[[end]]] (checksum: d41d8cd98f00b204e9800998ecf8427e)

  // storage fifos (between threads in this module)
  // (nothing is generated here for this design)
  /*[[[cog
       for f in dut.storage_fifos:
         if dut.find_parent(dut.put_tbl[f.nm]).nm == dut.module.nm:
            cog.outl("ga::ga_storage_fifo<%s, %d> %s;" % (f.ty, f.capacity, f.nm))
    ]]]*/
  //[[[end]]] (checksum: d41d8cd98f00b204e9800998ecf8427e)

  // SC_HAS_PROCESS is needed because the constructor below is written by hand
  // (not via SC_CTOR) and registers processes with SC_CTHREAD.
  /*[[[cog
       cog.outl("SC_HAS_PROCESS(%s_hls);" % (dut.nm,))
    ]]]*/
  SC_HAS_PROCESS(memcpy_hls);
  //[[[end]]] (checksum: 1639f171cef6dba259fd16954ea97228)

  // Constructor: names every port after its member, registers the cthreads,
  // and connects clk/rst to the memory-side fifo ports.
  /*[[[cog
       cog.outl("%s_hls(sc_module_name modname) :" % (dut.nm,))
    ]]]*/
  memcpy_hls(sc_module_name modname) :
  //[[[end]]] (checksum: 18fc9bafe7cadd2e6ff44e18aab1421e)
         sc_module(modname)
       , clk("clk"), rst("rst")
       , config("config"), start("start"), done("done")
       /*[[[cog
            for p in dut.inps:
              cog.outl(", {0}(\"{0}\")".format( p.reqNmK()))
              cog.outl(", {0}(\"{0}\")".format( p.respNmK()))
            for p in dut.outs:
              cog.outl(", {0}(\"{0}\")".format( p.reqNmK()))
              cog.outl(", {0}(\"{0}\")".format( p.dataNmK()))
            for m in dut.modules.values():
              cog.outl(", {0}(\"{0}\")".format( m.nm + "_inst"))
         ]]]*/
       , inpReqOut("inpReqOut")
       , inpRespIn("inpRespIn")
       , outReqOut("outReqOut")
       , outDataOut("outDataOut")
       //[[[end]]] (checksum: 2881ddb8430a98cd05d5dae52488ecb0)
  {
    // One SC_CTHREAD per entry in dut.cthreads, each clocked on the positive
    // edge of clk with rst registered as its asynchronous reset.
    /*[[[cog
         for c in dut.cthreads.values():
           cog.outl("SC_CTHREAD(%s, clk.pos());" % (c.nm,))
           cog.outl("async_reset_signal_is(rst, false);")
           cog.outl("")
      ]]]*/
    SC_CTHREAD(inp_fetcher, clk.pos());
    async_reset_signal_is(rst, false);

    SC_CTHREAD(inp_addr_gen, clk.pos());
    async_reset_signal_is(rst, false);

    SC_CTHREAD(out_addr_gen, clk.pos());
    async_reset_signal_is(rst, false);

    //[[[end]]] (checksum: 773ff4522c18e0df27ce11fb3e624133)
    // clk_rst hookup is emitted only for ports where dut.isHier(...) returns
    // the empty string; here that is all four memory ports.
    /*[[[cog
         for p in dut.inps:
           if "" == dut.isHier(RdReqPort(p.nm)):
              cog.outl("%s.clk_rst(clk, rst);" % (p.reqNmK(),))
           if "" == dut.isHier(RdRespPort(p.nm)):
              cog.outl("%s.clk_rst(clk, rst);" % (p.respNmK(),))
         for p in dut.outs:
           if "" == dut.isHier(WrReqPort(p.nm)):
              cog.outl("%s.clk_rst(clk, rst);" % (p.reqNmK(),))
           if "" == dut.isHier(WrDataPort(p.nm)):
              cog.outl("%s.clk_rst(clk, rst);" % (p.dataNmK(),))
      ]]]*/
    inpReqOut.clk_rst(clk, rst);
    inpRespIn.clk_rst(clk, rst);
    outReqOut.clk_rst(clk, rst);
    outDataOut.clk_rst(clk, rst);
    //[[[end]]] (checksum: 0436c52fea41c23b502c1f072eb76632)

    // Storage-fifo clk_rst hookup is compiled only when USE_HLS is not
    // defined (nothing is generated here for this design).
#ifndef USE_HLS
    /*[[[cog
         for f in dut.storage_fifos:
           if dut.find_parent(dut.put_tbl[f.nm]).nm == dut.module.nm:
             cog.outl("%s.clk_rst(clk, rst);" % f.nm)
      ]]]*/
    //[[[end]]] (checksum: d41d8cd98f00b204e9800998ecf8427e)
#endif

    // Port bindings for sub-module instances (nothing is generated here for
    // this design).
    /*[[[cog
         for m in dut.modules.values():
            for sig in ["clk","rst","start","config"]:
               cog.outl("{0}_inst.{1}({1});".format( m.nm, sig))
            if m.writes_to_done:
               cog.outl("{0}_inst.{1}({1});".format( m.nm, "done"))
            for p in dut.inps:
               if m.portOf( RdReqPort(p.nm)):
                 cog.outl("{0}_inst.{1}({1});".format( m.nm, p.reqNmK()))
               if m.portOf( RdRespPort(p.nm)):
                 cog.outl("{0}_inst.{1}({1});".format( m.nm, p.respNmK()))
            for p in dut.outs:
               if m.portOf( WrReqPort(p.nm)):
                 cog.outl("{0}_inst.{1}({1});".format( m.nm, p.reqNmK()))
               if m.portOf( WrDataPort(p.nm)):
                 cog.outl("{0}_inst.{1}({1});".format( m.nm, p.dataNmK()))
            for f in dut.tlm_fifos:
               cog.outl("{0}_inst.{1}({1});".format( m.nm, f.nm))
      ]]]*/
    //[[[end]]] (checksum: d41d8cd98f00b204e9800998ecf8427e)

  }

// The cthread bodies are member functions kept in one header per thread. They
// are included textually inside the class body, before the closing brace.
/*[[[cog
     for c in dut.cthreads.values():
       cog.outl("#include \"%s-%s.h\"" % (dut.nm,c.nm))
  ]]]*/
#include "memcpy-inp_fetcher.h"
#include "memcpy-inp_addr_gen.h"
#include "memcpy-out_addr_gen.h"
//[[[end]]] (checksum: 484e6e29ad0d99d170f7f9de8f0c3cb6)

};

/*[[[cog
     cog.outl("#include \"%s_acc.h\"" % (dut.nm,))
  ]]]*/
#include "memcpy_acc.h"
//[[[end]]] (checksum: e868954c0cd9f92182731fd98ad0951c)

// Build-time knobs. NUM_AUS defaults to 1, and the read/write channel counts
// default to NUM_AUS unless they are defined before this point.
#ifndef NUM_AUS
#define NUM_AUS 1
#endif

#ifndef RD_CHANNELS
#define RD_CHANNELS NUM_AUS
#endif

#ifndef WR_CHANNELS
#define WR_CHANNELS NUM_AUS
#endif

// Select the type exposed as dut_t: the plain memcpy_acc when NUM_AUS is 1,
// otherwise a multi_acc_template_Np instantiation parameterized with
// memcpy_sched<NUM_AUS>, memcpy_acc, Config and the channel counts.
#if NUM_AUS == 1
/*[[[cog
     cog.outl("typedef %s_acc dut_t;" % (dut.nm,))
  ]]]*/
typedef memcpy_acc dut_t;
//[[[end]]] (checksum: 0ce887778cbdeebc316aa28c2a590876)
#else
/*[[[cog
     cog.outl("#include \"%s_sched.h\"" % dut.nm)
     cog.outl("")
     cog.outl("typedef multi_acc_template_Np<NUM_AUS, %s_sched<NUM_AUS>, %s_acc, Config, RD_CHANNELS, WR_CHANNELS> dut_t;" % (dut.nm,dut.nm))
  ]]]*/
#include "memcpy_sched.h"

typedef multi_acc_template_Np<NUM_AUS, memcpy_sched<NUM_AUS>, memcpy_acc, Config, RD_CHANNELS, WR_CHANNELS> dut_t;
//[[[end]]] (checksum: a810fe1d315e268751e5f964942c0ebb)
#endif

#endif

We only need to add a small amount of custom code to make the accelerator perform the copy operations. Because we added the basic communication structure for the three cthreads to the dut_params.py file, most of the SystemC boilerplate has already been created for us. For example, the SC_CTHREAD and async_reset_signal_is statements are already filled in.

    SC_CTHREAD(inp_fetcher, clk.pos());
    async_reset_signal_is(rst, false);

    SC_CTHREAD(inp_addr_gen, clk.pos());
    async_reset_signal_is(rst, false);

    SC_CTHREAD(out_addr_gen, clk.pos());
    async_reset_signal_is(rst, false);

The code we need to add will be in the three thread specific header files: memcpy-inp_fetcher.h, memcpy-inp_addr_gen.h, and memcpy-out_addr_gen.h. We'll work on the first one---the thread that fetches responses from the inp port of the memory system and then sends this data to the out port of the memory system. The template system creates a starting implementation as follows:

/*[[[cog
     import cog
     from cog_acctempl import *
     from dut_params import dut
     if "thread_nm" not in globals():
       lst = cog.previous.lstrip('/').rstrip('\n').split('=')
       assert( lst[0]=="thread_nm")
       assert( len(lst)==2)
       global thread_nm
       thread_nm = lst[1]
     cog.outl( "//thread_nm=" + thread_nm)
  ]]]*/
//thread_nm=inp_fetcher
//[[[end]]] (checksum: b3cc55cdf47df8e2833f7683e251c994)

// Declare helper methods and class variables

/*[[[cog
     c = dut.get_cthread(thread_nm)
     cog.outl("void %s() {" % (c.nm,))
     for p in c.ports:
       cog.outl("  %s;" % p.reset)
  ]]]*/
void inp_fetcher() {
  inpRespIn.reset_get();
  outDataOut.reset_put();
//[[[end]]] (checksum: 78e3954429f1b9eaa6d398a4359e0c54)

  // Declare and initialize local variables

  /*[[[cog
       if c.writes_to_done:
         cog.outl("done = false;")
    ]]]*/
  done = false;
  //[[[end]]] (checksum: 872c87e47f55883a9563054415885245)
  // The first wait() ends the reset block; everything above it is reset code.
  wait();
  // Main loop: one pass per wait(); the body only acts while start is set.
  while (1) {
    if ( start) {

      // Fill in here

    }
    wait();
  }
}

There is some complex cog code at the top of the file that is used to remember the thread name in case you want to regenerate the interface after a change to the dut_params.py file. The interesting part starts with the automatically generated reset of the two memory ports, inpRespIn and outDataOut. These ports were assigned to this thread, which is why the reset code was generated. We can now add the code needed to perform our simple computation. We need to count the number of cache lines that have been processed so we know when we are done, so we declare the local variable ip and initialize it to zero.

    inpRespIn.reset_get();
    outDataOut.reset_put();

    unsigned int ip = 0;

    done = false;
    wait();

The assignment of false to done was also generated, because the thread has the property writes_to_done specified in its dut_params.py description. This completes the reset block of the thread.

In the main execution loop, we check to see if our loop counter has reached its final value, and if not, we perform a get operation on the inpRespIn port followed by a put operation on the outDataOut port. Otherwise, we raise the done flag.

    while (1) {
      if ( start) {
        if ( ip != config.read().get_nCLs()) {
          MemTypedReadRespType<CacheLine> wrapped_cl = inpRespIn.get();
          outDataOut.put(MemTypedWriteDataType<CacheLine>( wrapped_cl.data));
          ++ip;
        } else {
          done = true;
        }
      }
      wait();
    }

The start, done, and config variables are SystemC ports (sc_in for start and config, sc_out for done) declared in the (generated) interface of the module. See the Memory API page for more details on the memory system including the different types that are available. See the HLS IOLib page for information on how to perform blocking and non-blocking put and get communications.

Here is the final code for this thread:

  // Clocked thread: forwards config.read().get_nCLs() cache lines from the
  // inpRespIn port to the outDataOut port, then raises done.
  // NOTE(review): ip is never cleared and done is never lowered after the
  // reset block, so as written the thread performs one transfer per reset --
  // confirm this is the intended behavior.
  void inp_fetcher() {
    // Reset block: everything up to the first wait().
    inpRespIn.reset_get();
    outDataOut.reset_put();

    // Number of cache lines forwarded so far.
    unsigned int ip = 0;

    done = false;
    wait();

    while (1) {
      if ( start) {
        if ( ip != config.read().get_nCLs()) {
          // Forward one cache line: take a read response and re-wrap its
          // data field as a write-data item.
          MemTypedReadRespType<CacheLine> wrapped_cl = inpRespIn.get();
          outDataOut.put(MemTypedWriteDataType<CacheLine>( wrapped_cl.data));
          ++ip;
        } else {
          // Every requested cache line has been forwarded.
          done = true;
        }
      }
      wait();
    }
  }

We can also quickly code up the inp address request thread. Here we want to put a single request to inpReqOut with the starting address and number of cachelines.

  void inp_addr_gen() {
    inpReqOut.reset_put();
    bool already_sent = false;
    wait();

    while (1) {
      if ( start) {
        if ( !already_sent) {
          inpReqOut.put(MemTypedReadReqType<CacheLine>( config.read().getInpAddr( 0), config.read().get_nCLs()));
          already_sent = true;
        }         
      }
      wait();
    }
  }

The state variable already_sent is used to ensure the request is only sent once.

Similar code is used for the out address request thread.

  void out_addr_gen() {
    outReqOut.reset_put();
    bool already_sent = false;
    wait();

    while (1) {
      if ( start) {
        if ( !already_sent) {
          outReqOut.put(MemTypedWriteReqType<CacheLine>( config.read().getOutAddr( 0), config.read().get_nCLs()));
          already_sent = true;
        }         
      }
      wait();
    }
  }

Running your first SystemC Kernel Code Simulation

We can now link the software driver code from our earlier page together with our kernel implementation. Here are the first few lines of the driver code again:

#include "gtest/gtest.h"
#include "hld_alloc.h"

#include "AcclApp.h"
#include "Config.h"

TEST(AccelTest, SimpleTest) {

  AcclApp theApp;

  unsigned int n_cls = 1024;

  unsigned long long sz = 2ULL*n_cls*64;

  if ( theApp.alloc( sz)) {
    unsigned char *WORKSPACE = theApp.m_JointVirt;
    size_t WORKSPACE_SIZE    = theApp.m_JointSize;

    Config config;
...

We need to change one thing to get this to work with our hardware (SystemC) accelerator implementation. Instead of including the pure software header AcclApp.h we include the definition of class AcclApp from memcpy_hls_tb.h (which was automatically generated.) If we surround this with a compile time conditional, we can share the software driver code between the pure-software and SystemC models:

#ifdef KERNEL_TEST
#include "memcpy_hls_tb.h"
#else
#include "AcclApp.h"
#endif

We need a simple Makefile. Here is one that sets a few options and then includes a standard file containing the common make directives:

DEBUG_FLAGS=-O2 -g
CFLAGS += -DKERNEL_TEST

HLD_ROOT = ../..
SOURCES=tb.cpp
TARGET=accel_test

CXX=g++

include $(HLD_ROOT)/common/Makefile.inc

We set the source file to tb.cpp and the executable name to accel_test. We turn on the KERNEL_TEST preprocessor constant so that our SystemC code will be compiled in. Running make yields:

g++ -O2 -g -DKERNEL_TEST -std=c++11 -Wall -I../../common -I../../accio -I../../acctempl -Wno-virtual-move-assign -I/p/hdk/rtl/cad/x86-64_linux26/accellera/systemc/systemc-2.3.0/include -Wno-unused-label -I/nfs/site/disks/scl.work.58/ppt/aayupov/gtest/googletest/googletest/include  -o tb.o -c tb.cpp
g++ -MM -DKERNEL_TEST -std=c++11 -Wall -I../../common -I../../accio -I../../acctempl -Wno-virtual-move-assign -I/p/hdk/rtl/cad/x86-64_linux26/accellera/systemc/systemc-2.3.0/include -Wno-unused-label -I/nfs/site/disks/scl.work.58/ppt/aayupov/gtest/googletest/googletest/include  tb.cpp > tb.d
g++ -O2 -g -pthread -o accel_test tb.o /p/hdk/rtl/cad/x86-64_linux26/accellera/systemc/systemc-2.3.0/lib-linux64/libsystemc.a /nfs/site/disks/scl.work.58/ppt/aayupov/gtest/googletest/googletest/make/gtest_main.a

Running the compiled executable results in:

[COG_ENV_DIR] dlxc1340> ./accel_test 
[==========] Running 1 test from 1 test case.
[----------] Global test environment set-up.
[----------] 1 test from AccelTest
[ RUN      ] AccelTest.SimpleTest

Info: (I804) /IEEE_Std_1666/deprecated: interface and/or port binding in port constructors is deprecated
HW compute..
Results checked. 524288 of 524288 correct.
[       OK ] AccelTest.SimpleTest (2695 ms)
[----------] 1 test from AccelTest (2695 ms total)

[----------] Global test environment tear-down
[==========] 1 test from 1 test case ran. (2695 ms total)
[  PASSED  ] 1 test.
Clone this wiki locally