My favorites | Sign in
Project Home Downloads Wiki Issues Source
Search
for
Examples  
Examples.
Featured
Updated Aug 14, 2010 by kobalicek.petr

Assembler example

A small example of how to write a dynamic x86 function in C++ (from the AsmJit examples):

AsmJit-0.8

// Create simple DWORD memory copy function for 32 bit x86 platform:
// (for AsmJit version 0.8+)
//
// void memcpy32(UInt32* dst, const UInt32* src, SysUInt len);

// AsmJit library
#include <AsmJit/Assembler.h>
#include <AsmJit/MemoryManager.h>

// C library - printf
#include <stdio.h>

// It isn't needed to include namespace here, but when generating assembly it's
// easier to use directly eax, rax, ... instead of AsmJit::eax, AsmJit::rax, ...
using namespace AsmJit;

// This is type of function we will generate .
typedef void (*MemCpy32Fn)(UInt32*, const UInt32*, SysUInt);

// Entry point of the Assembler example: emits a DWORD copy routine instruction
// by instruction, turns it into a callable function with make(), calls it once
// and frees it.
// Returns 1 when make() gives back a null function pointer, 0 otherwise.
int main(int argc, char* argv[])
{
  // ==========================================================================
  // Part 1:

  // Create Assembler.
  Assembler a;
  
  // Arguments offset relative to ebp after the prolog below: 4 (return address
  // pushed by the caller's call instruction) + 4 (ebp pushed by the prolog).
  const int arg_offset = 4 + 4;
  
  // Labels.
  Label L_Loop;
  Label L_Exit;
  
  // Prolog: save the caller's ebp, make ebp the frame base, then save esi and
  // edi because the code below overwrites them.
  a.push(ebp);
  a.mov(ebp, esp);
  a.push(esi);
  a.push(edi);
  
  // Fetch arguments (arguments were passed in right-to-left direction, so the
  // first one sits at the lowest offset and we read them left-to-right).
  a.mov(edi, dword_ptr(ebp, arg_offset + 0)); // get dst
  a.mov(esi, dword_ptr(ebp, arg_offset + 4)); // get src
  a.mov(ecx, dword_ptr(ebp, arg_offset + 8)); // get len

  // Exit if length is zero.
  a.test(ecx, ecx);
  a.jz(&L_Exit);
  
  // Bind L_Loop label to here.
  a.bind(&L_Loop);
  
  // Copy one DWORD from [esi] to [edi] through eax.
  a.mov(eax, dword_ptr(esi));
  a.mov(dword_ptr(edi), eax);
  
  // Advance both pointers by 4 bytes (one DWORD).
  a.add(esi, 4);
  a.add(edi, 4);
  
  // Decrement the counter and loop while ecx is not zero.
  a.dec(ecx);
  a.jnz(&L_Loop);
  
  // Exit.
  a.bind(&L_Exit);
  
  // Epilog: restore edi and esi in reverse push order, then esp and ebp.
  a.pop(edi);
  a.pop(esi);
  a.mov(esp, ebp);
  a.pop(ebp);

  // Return.
  a.ret();
  // ==========================================================================

  // ==========================================================================
  // Part 2:

  // Make JIT function.
  MemCpy32Fn fn = function_cast<MemCpy32Fn>(a.make());

  // Ensure that everything is ok; a null pointer means make() failed, in which
  // case the assembler's error value is printed.
  if (!fn)
  {
    printf("Error making jit function (%u).\n", a.error());
    return 1;
  }

  // Create some data. No initializer is given, so the buffer contents are
  // whatever happens to be in that storage; the example only demonstrates the
  // call itself.
  UInt32 dst[128];
  UInt32 src[128];
  
  // Call JIT function: copy 128 DWORDs from src to dst.
  fn(dst, src, 128);
  
  // If you don't need the function anymore, it should be freed. Default memory
  // manager used by make() is instance returned by MemoryManager::global().
  MemoryManager::global()->free((void*)fn);
  // ==========================================================================

  return 0;
}

AsmJit-1.0

// Create simple DWORD memory copy function for 32 bit x86 platform:
// (for AsmJit version 1.0)
//
// void memcpy32(uint32_t* dst, const uint32_t* src, sysuint_t len);

// AsmJit library
#include <AsmJit/Assembler.h>
#include <AsmJit/MemoryManager.h>

// C library - printf
#include <stdio.h>

// Importing the namespace is optional; it only lets the code-generating calls
// name registers as eax, rax, ... rather than AsmJit::eax, AsmJit::rax, ...
using namespace AsmJit;

// Pointer type of the function we are going to generate.
typedef void (*MemCpy32Fn)(uint32_t* dst, const uint32_t* src, sysuint_t len);

// Entry point of the Assembler example (AsmJit-1.0 syntax): emits a DWORD copy
// routine instruction by instruction, turns it into a callable function with
// make(), calls it once and frees it.
// Returns 1 when make() gives back a null function pointer, 0 otherwise.
int main(int argc, char* argv[])
{
  // ==========================================================================
  // Part 1:

  // Create Assembler.
  Assembler a;
  
  // Arguments offset relative to ebp after the prolog below: 4 (return address
  // pushed by the caller's call instruction) + 4 (ebp pushed by the prolog).
  const int arg_offset = 4 + 4;
  
  // Labels (obtained from the assembler and passed by value below).
  Label L_Loop = a.newLabel();
  Label L_Exit = a.newLabel();
  
  // Prolog: save the caller's ebp, make ebp the frame base, then save esi and
  // edi because the code below overwrites them.
  a.push(ebp);
  a.mov(ebp, esp);
  a.push(esi);
  a.push(edi);
  
  // Fetch arguments (arguments were passed in right-to-left direction, so the
  // first one sits at the lowest offset and we read them left-to-right).
  a.mov(edi, dword_ptr(ebp, arg_offset + 0)); // get dst
  a.mov(esi, dword_ptr(ebp, arg_offset + 4)); // get src
  a.mov(ecx, dword_ptr(ebp, arg_offset + 8)); // get len

  // Exit if length is zero.
  a.test(ecx, ecx);
  a.jz(L_Exit);
  
  // Bind L_Loop label to here.
  a.bind(L_Loop);
  
  // Copy one DWORD from [esi] to [edi] through eax.
  a.mov(eax, dword_ptr(esi));
  a.mov(dword_ptr(edi), eax);
  
  // Advance both pointers by 4 bytes (one DWORD).
  a.add(esi, 4);
  a.add(edi, 4);
  
  // Decrement the counter and loop while ecx is not zero.
  a.dec(ecx);
  a.jnz(L_Loop);
  
  // Exit.
  a.bind(L_Exit);
  
  // Epilog: restore edi and esi in reverse push order, then esp and ebp.
  a.pop(edi);
  a.pop(esi);
  a.mov(esp, ebp);
  a.pop(ebp);

  // Return.
  a.ret();
  // ==========================================================================

  // ==========================================================================
  // Part 2:

  // Make JIT function.
  MemCpy32Fn fn = function_cast<MemCpy32Fn>(a.make());

  // Ensure that everything is ok; a null pointer means make() failed, in which
  // case the assembler's error value is printed.
  if (!fn)
  {
    printf("Error making jit function (%u).\n", a.getError());
    return 1;
  }

  // Create some data. No initializer is given, so the buffer contents are
  // whatever happens to be in that storage; the example only demonstrates the
  // call itself.
  uint32_t dst[128];
  uint32_t src[128];
  
  // Call JIT function: copy 128 DWORDs from src to dst.
  fn(dst, src, 128);
  
  // If you don't need the function anymore, it should be freed. This example
  // frees it through the instance returned by MemoryManager::getGlobal().
  MemoryManager::getGlobal()->free((void*)fn);
  // ==========================================================================

  return 0;
}

This code generates binary code for a simple memcpy32 function (a memory copy where one unit is a DWORD). In part 1 the Assembler class is used to generate the code stream. You can see the prolog, the argument fetching, the function body and the epilog. The function prolog and epilog depend on the calling convention of the generated function.

Part of the AsmJit library is cross-platform code for allocating memory in which generated JIT functions can be executed. The recommended way is to use the make() method, which uses a MemoryManager to allocate memory and relocates the code to that buffer. Code is always generated into normal memory first, and when generation is finished it is relocated to execution-enabled memory. If you do not pass a MemoryManager instance to make(), the global memory manager is used - the instance that can be obtained through MemoryManager::global().

There are also several different ways to enable code execution in memory manually. First, you need to know that this type of memory is allocated by different functions than standard memory. Under Windows it is the VirtualAlloc() function, and under POSIX operating systems (such as Linux, BSD, MacOS) it is mmap(). There is also the function mprotect(), which can enable code execution in memory allocated by malloc(). AsmJit does not use this approach, because it enables execution for a whole page (which is usually 4KB).

The AsmJit memory management model was improved, and it is now the recommended way to allocate and free JIT functions. If you still need to allocate memory for your code manually, you can use the VirtualMemory class, which is cross-platform and allocates chunks of virtual memory.

Compiler example

In the previous example you can see that the generated assembly targets only 32-bit x86 processors and the CDECL function calling convention. It is not a big problem to change the calling convention to STDCALL or FASTCALL, but it cannot be done without modifying the assembler code, and such a modification can introduce new bugs in your code. In addition, x64 (64-bit) calling conventions differ between Windows and other operating systems, so writing functions for both 64-bit Windows and Linux/BSDs is a bit harder than for 32-bit x86 platforms. The Compiler class was created to address this.

Compiler is an optional component (you can live without it) that can do all the dirty work of generating functions for you. It contains a function builder, a register allocator and a variable manager. It can also reserve stack space for local variables, which can be automatically spilled to or restored from it. Compiler can also write functions without prolog/epilog code, which can give you one extra register (EBP/RBP). You can play with the various settings and see what happens.

Here is portable code that uses Compiler to create a simple DWORD memory copy function:

AsmJit-0.8

// Create simple DWORD memory copy function for 32/64 bit x86 platform, this
// is an enhanced version that uses the Compiler class:
//
// void memcpy32(UInt32* dst, const UInt32* src, SysUInt len);

// AsmJit library
#include <AsmJit/Assembler.h>
#include <AsmJit/Compiler.h>
#include <AsmJit/MemoryManager.h>

// C library - printf
#include <stdio.h>

// It isn't needed to include namespace here, but when generating assembly it's
// easier to use directly eax, rax, ... instead of AsmJit::eax, AsmJit::rax, ...
using namespace AsmJit;

// This is type of function we will generate .
typedef void (*MemCpy32Fn)(UInt32*, const UInt32*, SysUInt);

// Entry point of the Compiler example: describes a DWORD copy function through
// the Compiler class (arguments and the temporary are variables rather than
// fixed registers), makes it callable with make(), calls it once and frees it.
// Returns 1 when make() gives back a null function pointer, 0 otherwise.
int main(int argc, char* argv[])
{
  // ==========================================================================
  // Part 1:

  // Create Compiler.
  Compiler c;

  // Tell compiler the function prototype we want. It allocates variables representing
  // function arguments that can be accessed through Compiler or Function instance.
  Function& f = * c.newFunction(CALL_CONV_DEFAULT, BuildFunction3<UInt32*, const UInt32*, SysUInt>());

  // Try to generate function without prolog / epilog code:
  f.setNaked(true);

  // Other choices:
  // - f.setPrologEpilogPushPop(bool)
  //   (whether prolog / epilog should be generated using push/pop or mov instructions)
  // - f.setAllocableEbp(bool)
  //   (enable ebp to be allocated by register allocator)

  // Labels must be allocated by compiler (this is different to using Assembler class).
  Label* L_Loop = c.newLabel();
  Label* L_Exit = c.newLabel();

  // Function arguments, in prototype order: destination, source, count.
  PtrRef dst(c.argument(0));
  PtrRef src(c.argument(1));
  SysIntRef cnt(c.argument(2));
   
  // Allocate variables:
  // - first argument is variable type
  // - second argument is variable priority:
  //     - lower means better
  //     - 0 means register only
  //       (so variable can't be spilled and spill means ASSERTion failure)
  Int32Ref tmp(c.newVariable(VARIABLE_TYPE_INT32, 0));
  tmp.alloc();

  // Allocate registers (if they are not allocated)
  dst.alloc();
  src.alloc();
  cnt.alloc();

  // Exit if length is zero.
  c.test(cnt.r(), cnt.r());
  c.jz(L_Exit);

  // Loop.
  c.bind(L_Loop);

  // Copy DWORD (4 bytes) from [src] to [dst] through tmp.
  c.mov(tmp.r(), dword_ptr(src.r()));
  c.mov(dword_ptr(dst.r()), tmp.r());

  // Increment pointers by one DWORD.
  c.add(src.r(), 4);
  c.add(dst.r(), 4);

  // Decrement the counter and loop while cnt is not zero.
  c.dec(cnt.r());
  c.jnz(L_Loop);

  // Exit.
  c.bind(L_Exit);

  // Finish.
  c.endFunction();
  // ==========================================================================

  // ==========================================================================
  // Part 2:

  // Make JIT function.
  MemCpy32Fn fn = function_cast<MemCpy32Fn>(c.make());

  // Ensure that everything is ok; a null pointer means make() failed, in which
  // case the compiler's error value is printed.
  if (!fn)
  {
    printf("Error making jit function (%u).\n", c.error());
    return 1;
  }

  // Create some data. No initializer is given, so the buffer contents are
  // whatever happens to be in that storage; the example only demonstrates the
  // call itself.
  UInt32 dstBuffer[128];
  UInt32 srcBuffer[128];
  
  // Call the JIT function: copy 128 DWORDs from srcBuffer to dstBuffer.
  fn(dstBuffer, srcBuffer, 128);
  
  // Free the JIT function if it's not needed anymore.
  MemoryManager::global()->free((void*)fn);
  // ==========================================================================

  return 0;
}

AsmJit-1.0

// Create simple DWORD memory copy function for 32/64 bit x86 platform, this
// is an enhanced version that uses the Compiler class:
//
// void memcpy32(uint32_t* dst, const uint32_t* src, sysuint_t len);

// AsmJit library
#include <AsmJit/AsmJit.h>

// C library - printf
#include <stdio.h>

// Importing the namespace is optional; it only lets the code-generating calls
// name registers as eax, rax, ... rather than AsmJit::eax, AsmJit::rax, ...
using namespace AsmJit;

// Pointer type of the function we are going to generate.
typedef void (*MemCpy32Fn)(uint32_t* dst, const uint32_t* src, sysuint_t len);

// Entry point of the Compiler example (AsmJit-1.0 syntax): describes a DWORD
// copy function through the Compiler class using GPVar variables instead of
// fixed registers, makes it callable with make(), calls it once and frees it.
// Returns 1 when make() gives back a null function pointer, 0 otherwise.
int main(int argc, char* argv[])
{
  // ==========================================================================
  // Part 1:

  // Create Compiler.
  Compiler c;

  // Tell compiler the function prototype we want. It allocates variables representing
  // function arguments that can be accessed through Compiler or Function instance.
  c.newFunction(CALL_CONV_DEFAULT, FunctionBuilder3<Void, uint32_t*, const uint32_t*, sysuint_t>());

  // Try to generate function without prolog/epilog code:
  c.getFunction()->setHint(FUNCTION_HINT_NAKED, true);

  // Create labels.
  Label L_Loop = c.newLabel();
  Label L_Exit = c.newLabel();

  // Function arguments, in prototype order: destination, source, count.
  GPVar dst(c.argGP(0));
  GPVar src(c.argGP(1));
  GPVar cnt(c.argGP(2));

  // Allocate registers for the loop variables (if they are not allocated already).
  c.alloc(dst);
  c.alloc(src);
  c.alloc(cnt);

  // Exit if length is zero.
  c.test(cnt, cnt);
  c.jz(L_Exit);

  // Loop.
  c.bind(L_Loop);

  // Copy DWORD (4 bytes) from [src] to [dst] through a new temporary variable.
  GPVar tmp(c.newGP(VARIABLE_TYPE_GPD));
  c.mov(tmp, dword_ptr(src));
  c.mov(dword_ptr(dst), tmp);

  // Increment dst/src pointers by one DWORD.
  c.add(src, 4);
  c.add(dst, 4);

  // Decrement the counter and loop while cnt is not zero.
  c.dec(cnt);
  c.jnz(L_Loop);

  // Exit.
  c.bind(L_Exit);

  // Finish.
  c.endFunction();
  // ==========================================================================

  // ==========================================================================
  // Part 2:

  // Make JIT function.
  MemCpy32Fn fn = function_cast<MemCpy32Fn>(c.make());

  // Ensure that everything is ok; a null pointer means make() failed, in which
  // case the compiler's error value is printed.
  if (!fn)
  {
    printf("Error making jit function (%u).\n", c.getError());
    return 1;
  }

  // Create some data. No initializer is given, so the buffer contents are
  // whatever happens to be in that storage; the example only demonstrates the
  // call itself.
  uint32_t dstBuffer[128];
  uint32_t srcBuffer[128];
  
  // Call the JIT function: copy 128 DWORDs from srcBuffer to dstBuffer.
  fn(dstBuffer, srcBuffer, 128);
  
  // Free the JIT function if it's not needed anymore.
  MemoryManager::getGlobal()->free((void*)fn);
  // ==========================================================================

  return 0;
}

You can see that code generation is straightforward. Because AsmJit is based on a very extensible mechanism, Compiler uses the same code base for intrinsics as Assembler (in AsmJit this is called Serializing). But instead of emitting code directly, it stores all instructions in an internal representation and emits everything when the code is complete. This opens the door to many optimizations and helps with generating function prologs / epilogs.

Another advantage is that code generated through Compiler will run on both 32-bit and 64-bit platforms. The generated code will use the calling convention used by your operating system and C++ compiler (you can of course set a different calling convention; for example, you can use the AsmJit library to generate a function for 64-bit Windows through 64-bit Wine).

NOTE 1: You can see that some instances used in the first example are created here through Compiler. Because Compiler is a more complex class than the pure Assembler, you need to accept its rules. Everything is of course documented.

NOTE 2: The AsmJit-1.0 code is provided for comparison. There are some differences, and this page will be updated after the AsmJit-1.0 release. The main differences are a standardized label syntax for Assembler/Compiler and a much simpler syntax when using variables.


Sign in to add a comment
Powered by Google Project Hosting