commit aaddcc0cbd10c9e7de46c6ad1ad6c4a20a9eaf17 Author: niansa Date: Thu Mar 30 07:03:33 2023 -0500 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4a0b530 --- /dev/null +++ b/.gitignore @@ -0,0 +1,74 @@ +# This file is used to ignore files which are generated +# ---------------------------------------------------------------------------- + +*~ +*.autosave +*.a +*.core +*.moc +*.o +*.obj +*.orig +*.rej +*.so +*.so.* +*_pch.h.cpp +*_resource.rc +*.qm +.#* +*.*# +core +!core/ +tags +.DS_Store +.directory +*.debug +Makefile* +*.prl +*.app +moc_*.cpp +ui_*.h +qrc_*.cpp +Thumbs.db +*.res +*.rc +/.qmake.cache +/.qmake.stash + +# qtcreator generated files +*.pro.user* +CMakeLists.txt.user* + +# xemacs temporary files +*.flc + +# Vim temporary files +.*.swp + +# Visual Studio generated files +*.ib_pdb_index +*.idb +*.ilk +*.pdb +*.sln +*.suo +*.vcproj +*vcproj.*.*.user +*.ncb +*.sdf +*.opensdf +*.vcxproj +*vcxproj.* + +# MinGW generated files +*.Debug +*.Release + +# Python byte code +*.pyc + +# Binaries +# -------- +*.dll +*.exe + diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..0477fdd --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "llama.cpp"] + path = llama.cpp + url = https://github.com/ggerganov/llama.cpp.git diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..858d069 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,24 @@ +cmake_minimum_required(VERSION 3.14) + +project(libjustlm LANGUAGES C CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +add_library(libjustlm STATIC + libjustlm_core.cpp + justlm.hpp +) + +set(LM_BACKEND "llama.cpp" CACHE STRING "The language model backend to use") + +if (LM_BACKEND STREQUAL "libnc gpt2") + add_library(libjustlm_gpt2 STATIC libjustlm_gpt2.cpp gpt2/arith.c gpt2/cp_utils.c gpt2/gpt2tc.c) + target_link_libraries(libjustlm_gpt2 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gpt2/libnc.so pthread) +elseif (LM_BACKEND 
STREQUAL "llama.cpp") + add_subdirectory(llama.cpp) + add_library(libjustlm_llama STATIC libjustlm_llama.cpp) + target_link_libraries(libjustlm_llama PRIVATE llama) +else() + message(FATAL_ERROR "LM_BACKEND '${LM_BACKEND}' is unsupported. Please use either 'libnc gpt2' or 'llama.cpp'.") +endif() diff --git a/gpt2/VERSION b/gpt2/VERSION new file mode 100644 index 0000000..da4ce28 --- /dev/null +++ b/gpt2/VERSION @@ -0,0 +1 @@ +2021-04-24 diff --git a/gpt2/arith.c b/gpt2/arith.c new file mode 100644 index 0000000..79d0148 --- /dev/null +++ b/gpt2/arith.c @@ -0,0 +1,301 @@ +/* + * Arithmetic coder + * + * Copyright (c) 2018-2021 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cutils.h" +#include "arith.h" + +#define RANGE_MIN_BITS 16 +#define RANGE_MIN ((0xff << (RANGE_MIN_BITS - 8)) + 1) +#define RANGE_MAX (0xff << RANGE_MIN_BITS) + +//#define DUMP_PUT_BIT +//#define DUMP_GET_BIT + +void put_bit_init(PutBitState *s, uint8_t *buf, int buf_size, + PutBitWriteFunc *write_func, void *opaque) +{ + s->low = 0; + s->range = RANGE_MAX; + s->current_byte = 0xff; + s->n_bytes = 0; + s->buf = buf; + s->buf_size = buf_size; + s->idx = 0; + s->write_func = write_func; + s->opaque = opaque; + s->byte_count = 0; + assert(PROB_UNIT <= RANGE_MIN); +} + +static void put_byte(PutBitState *s, int v) +{ + s->buf[s->idx++] = v; + if (unlikely(s->idx == s->buf_size)) { + s->byte_count += s->idx; + s->write_func(s->opaque, s->buf, s->idx); + s->idx = 0; + } +} + +/* 0 <= v <= 0x1fe. The current output stream contains n_bytes with: + current_byte, then (n_bytes - 1) x 0xff + */ +static void put_val(PutBitState *s, int v) +{ + uint32_t carry, b; + +#ifdef DUMP_PUT_BIT + printf(" out=%d\n", v); +#endif + if (v == 0xff) { + s->n_bytes++; + } else { + if (s->n_bytes > 0) { + carry = v >> 8; + put_byte(s, s->current_byte + carry); + b = (0xff + carry) & 0xff; + while (s->n_bytes > 1) { + put_byte(s, b); + s->n_bytes--; + } + } + s->n_bytes = 1; + s->current_byte = v; + } +} + +static void put_val_flush(PutBitState *s) +{ + if (s->n_bytes > 0) { + put_val(s, 0); + } +} + +static void put_bit_renorm(PutBitState *s) +{ + uint32_t v; + /* after renormalisation: + 0 <= low <= RANGE_MAX + RANGE_MIN <= range <= RANGE_MAX + In the worst case before normalisation: + low_max = 2 * RANGE_MAX hence v <= 0x1fe + */ + while (s->range < RANGE_MIN) { + v = s->low >> RANGE_MIN_BITS; + put_val(s, v); + s->low = (s->low & ((1 << RANGE_MIN_BITS) - 1)) << 8; + s->range <<= 8; + } +} + +/* 0 < prob0 < PROB_UNIT */ +void put_bit(PutBitState *s, int prob0, int bit) +{ + int range0; + + 
assert(s->range >= RANGE_MIN); + range0 = ((uint64_t)s->range * prob0) >> PROB_UNIT_BITS; + assert(range0 > 0); + assert(range0 < s->range); +#if defined(DUMP_PUT_BIT) + { + static int count; + printf("%d: range=%d b=%d range0=%d low=%d\n", + count++, s->range, bit, range0, s->low); + } +#endif + if (!bit) { + s->range = range0; + } else { + s->low += range0; + s->range -= range0; + } + + put_bit_renorm(s); +} + +void put_bit_raw(PutBitState *s, int bit) +{ + int range0; + + assert(s->range >= RANGE_MIN); + range0 = s->range >> 1; + if (!bit) { + s->range = range0; + } else { + s->low += range0; + s->range -= range0; + } + + put_bit_renorm(s); +} + +/* return the minimum number of bits to be able to correctly decode */ +int64_t put_bit_flush(PutBitState *s) +{ + int n, val, mask; + + /* force larger range */ + if (s->range < (1 << RANGE_MIN_BITS)) { + put_val(s, s->low >> RANGE_MIN_BITS); + s->low = (s->low & ((1 << RANGE_MIN_BITS) - 1)) << 8; + s->range <<= 8; + } + + /* largest n such as 2^n <= range */ + n = 0; + while ((1 << (n + 1)) <= s->range) + n++; + assert(n >= RANGE_MIN_BITS && n <= (RANGE_MIN_BITS + 7)); + + val = s->low; + mask = (1 << n) - 1; + if ((val & mask) != 0) + val = (val + (1 << n)) & ~mask; + assert(val >= s->low && val < s->low + s->range); + + put_val(s, val >> RANGE_MIN_BITS); + put_val_flush(s); + if (s->idx > 0) { + s->byte_count += s->idx; + s->write_func(s->opaque, s->buf, s->idx); + s->idx = 0; + } + return (s->byte_count - 1) * 8 + (RANGE_MIN_BITS + 8 - n); +} + +/* return the approximate number of written bits */ +int64_t put_bit_get_bit_count(PutBitState *s) +{ + int n; + n = 0; + while ((1 << (n + 1)) <= s->range) + n++; + return (s->byte_count + s->idx) * 8 + (RANGE_MIN_BITS + 7 - n); +} + +/****************************************/ + +static void refill(GetBitState *s) +{ + s->range <<= 8; + s->low <<= 8; + if (s->idx >= s->buf_len) { + if (!s->read_func) + return; /* pad with zeros */ + s->buf_len = s->read_func(s->opaque, 
s->buf, s->buf_size); + s->byte_count += s->buf_len; + s->idx = 0; + } +#ifdef DUMP_GET_BIT + printf(" in=%d\n", s->buf[s->idx]); +#endif + s->low += s->buf[s->idx++]; +} + +void get_bit_init(GetBitState *s, uint8_t *buf, size_t buf_size, + GetBitReadFunc *read_func, void *opaque) +{ + int i; + s->buf_size = buf_size; + s->buf = buf; + s->read_func = read_func; + s->opaque = opaque; + if (read_func) { + s->buf_len = 0; + } else { + /* prefilled buffer */ + s->buf_len = s->buf_size; + } + s->byte_count = s->buf_len; + s->range = 0; + s->low = 0; + s->idx = 0; + for(i = 0; i <= RANGE_MIN_BITS; i += 8) { + refill(s); + } + s->range = RANGE_MAX; +} + +/* 0 < prob0 < PROB_UNIT */ +int get_bit(GetBitState *s, int prob0) +{ + int b, range0; + + assert(s->range >= RANGE_MIN); + range0 = ((uint64_t)s->range * prob0) >> PROB_UNIT_BITS; + assert(range0 > 0); + assert(range0 < s->range); + b = s->low >= range0; +#ifdef DUMP_GET_BIT + { + static int count; + printf("%d: range=%d b=%d range0=%d low=%d\n", count++, s->range, b, range0, s->low); + } +#endif + if (b) { + s->low -= range0; + s->range -= range0; + } else { + s->range = range0; + } + while (s->range < RANGE_MIN) + refill(s); + return b; +} + +/* no context */ +int get_bit_raw(GetBitState *s) +{ + int b, range0; + range0 = s->range >> 1; + b = s->low >= range0; + if (b) { + s->low -= range0; + s->range -= range0; + } else { + s->range = range0; + } + if (s->range < RANGE_MIN) + refill(s); + return b; +} + +/* return the approximate number of read bits */ +int64_t get_bit_get_bit_count(GetBitState *s) +{ + int n; + n = 0; + while ((1 << (n + 1)) <= s->range) + n++; + return (s->byte_count - s->buf_len + s->idx) * 8 - n; +} diff --git a/gpt2/arith.h b/gpt2/arith.h new file mode 100644 index 0000000..d1a4e31 --- /dev/null +++ b/gpt2/arith.h @@ -0,0 +1,73 @@ +/* + * Arithmetic coder + * + * Copyright (c) 2018-2019 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of 
this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef ARITH_H +#define ARITH_H + +#define PROB_UNIT_BITS 15 +#define PROB_UNIT (1 << PROB_UNIT_BITS) + +typedef void PutBitWriteFunc(void *opaque, const uint8_t *buf, size_t buf_size); + +typedef struct { + uint32_t range; + uint32_t low; + uint8_t current_byte; + uint32_t n_bytes; + uint8_t *buf; + size_t buf_size; + size_t idx; /* current position in bytes */ + PutBitWriteFunc *write_func; + void *opaque; + uint64_t byte_count; +} PutBitState; + +void put_bit_init(PutBitState *s, uint8_t *buf, int buf_size, + PutBitWriteFunc *write_func, void *opaque); +void put_bit(PutBitState *s, int prob0, int bit); +void put_bit_raw(PutBitState *s, int bit); +int64_t put_bit_flush(PutBitState *s); +int64_t put_bit_get_bit_count(PutBitState *s); + +/* return the number of read bytes */ +typedef ssize_t GetBitReadFunc(void *opaque, uint8_t *buf, size_t buf_size); + +typedef struct { + uint8_t *buf; + int buf_len; + int buf_size; + int idx; + uint32_t low; + uint32_t range; + 
GetBitReadFunc *read_func; + void *opaque; + uint64_t byte_count; +} GetBitState; + +void get_bit_init(GetBitState *s, uint8_t *buf, size_t buf_size, + GetBitReadFunc *read_func, void *opaque); +int get_bit(GetBitState *s, int prob0); +int get_bit_raw(GetBitState *s); +int64_t get_bit_get_bit_count(GetBitState *s); + +#endif /* ARITH_H */ diff --git a/gpt2/cp_utils.c b/gpt2/cp_utils.c new file mode 100644 index 0000000..d049d92 --- /dev/null +++ b/gpt2/cp_utils.c @@ -0,0 +1,316 @@ +/* + * Compression utilities + * + * Copyright (c) 2018-2019 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _WIN32 +#include +#endif + +#include "cutils.h" +#include "libnc.h" +#include "cp_utils.h" + +void fatal_error(const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + fprintf(stderr, "Fatal error: "); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + exit(1); +} + +int64_t get_time_ms(void) +{ +#ifdef _WIN32 + struct timeval tv; + gettimeofday(&tv, NULL); + return (int64_t)tv.tv_sec * 1000 + (tv.tv_usec / 1000U); +#else + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (int64_t)ts.tv_sec * 1000 + (ts.tv_nsec / 1000000U); +#endif +} + +void fput_u8(FILE *f, uint8_t v) +{ + fputc(v, f); +} + +int fget_u8(FILE *f, uint8_t *pv) +{ + int c; + c = fgetc(f); + if (c < 0) + return -1; + *pv = c; + return 0; +} + +void fput_be16(FILE *f, uint16_t v) +{ + fputc(v >> 8, f); + fputc(v >> 0, f); +} + +int fget_be16(FILE *f, uint16_t *pv) +{ + uint8_t buf[2]; + if (fread(buf, 1, sizeof(buf), f) != sizeof(buf)) + return -1; + *pv = (buf[0] << 8) | + (buf[1] << 0); + return 0; +} + +void fput_be32(FILE *f, uint32_t v) +{ + fputc(v >> 24, f); + fputc(v >> 16, f); + fputc(v >> 8, f); + fputc(v >> 0, f); +} + +int fget_be32(FILE *f, uint32_t *pv) +{ + uint8_t buf[4]; + if (fread(buf, 1, sizeof(buf), f) != sizeof(buf)) + return -1; + *pv = (buf[0] << 24) | + (buf[1] << 16) | + (buf[2] << 8) | + (buf[3] << 0); + return 0; +} + +void fput_sgd_opt(FILE *f, const SGDOptParams *p) +{ + fput_u8(f, p->algo); + switch(p->algo) { + case SGD_OPT_BASIC: + break; + case SGD_OPT_ADAM: + fput_f32(f, p->u.adam.beta1); + fput_f32(f, p->u.adam.beta2); + fput_f32(f, p->u.adam.eps); + fput_f32(f, p->u.adam.gradient_clip); + break; + default: + abort(); + } +} + +int fget_sgd_opt(FILE *f, SGDOptParams *p) +{ + uint8_t v8; + + if (fget_u8(f, &v8)) + return -1; + p->algo = v8; + switch(p->algo) { + case SGD_OPT_BASIC: + break; + case SGD_OPT_ADAM: + if (fget_f32(f, &p->u.adam.beta1)) + return -1; + if (fget_f32(f, &p->u.adam.beta2)) + return -1; + if (fget_f32(f, &p->u.adam.eps)) + return -1; + if (fget_f32(f, &p->u.adam.gradient_clip)) + return -1; + break; + default: + return -1; + } + return 0; +} + 
+void dump_sgd_opt_params(FILE *f, const SGDOptParams *p) +{ + switch(p->algo) { + case SGD_OPT_BASIC: + fprintf(f, " sgd_opt=%s", + "none"); + break; + case SGD_OPT_ADAM: + fprintf(f, " sgd_opt=%s beta1=%g beta2=%g eps=%g gclip=%g", + "adam", + p->u.adam.beta1, + p->u.adam.beta2, + p->u.adam.eps, + p->u.adam.gradient_clip); + break; + default: + abort(); + } +} + +typedef union { + float f; + uint32_t u32; +} f32; + +void fput_f32(FILE *f, float v) +{ + f32 u; + u.f = v; + fput_be32(f, u.u32); +} + +int fget_f32(FILE *f, float *pv) +{ + f32 u; + if (fget_be32(f, &u.u32)) + return -1; + *pv = u.f; + return 0; +} + +void write_sym(PutBitState *pb, const float *prob_table, int n_symb, int sym) +{ + int start, range, prob0, bit, range0; + float p, p0; + + start = 0; + range = n_symb; + p = 1.0; /* invariant: p=sum(prob_table[start...start + range]) */ + while (range > 1) { + range0 = range >> 1; + p0 = vec_sum_f32(prob_table + start, range0); + prob0 = lrintf(p0 * PROB_UNIT / p); + prob0 = clamp_int(prob0, 1, PROB_UNIT - 1); + bit = sym >= (start + range0); + put_bit(pb, prob0, bit); + if (bit) { + start += range0; + range = range - range0; + p = p - p0; + } else { + p = p0; + range = range0; + } + } +} + +int read_sym(GetBitState *gb, const float *prob_table, int n_symb) +{ + int start, range, prob0, bit, range0; + float p, p0; + + start = 0; + range = n_symb; + p = 1.0; /* invariant: p=sum(prob_table[start...start + range]) */ + while (range > 1) { + range0 = range >> 1; + p0 = vec_sum_f32(prob_table + start, range0); + prob0 = lrintf(p0 * PROB_UNIT / p); + prob0 = clamp_int(prob0, 1, PROB_UNIT - 1); + bit = get_bit(gb, prob0); + if (bit) { + start += range0; + range = range - range0; + p = p - p0; + } else { + p = p0; + range = range0; + } + } + return start; +} + +void create_debug_dir(char *debug_dir, size_t debug_dir_size, + const char *debug_path, const char *prefix) +{ + char name1[1024]; + struct tm *tm; + time_t ti; + + snprintf(name1, sizeof(name1), 
"%s/%s", debug_path, prefix); +#ifdef _WIN32 + _mkdir(name1); +#else + mkdir(name1, 0777); +#endif + + ti = time(NULL); + tm = localtime(&ti); + snprintf(debug_dir, debug_dir_size, "%s/%04u%02u%02u-%02u%02u%02u", + name1, + tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, + tm->tm_hour, tm->tm_min, tm->tm_sec); +#ifdef _WIN32 + _mkdir(debug_dir); +#else + mkdir(debug_dir, 0777); +#endif +} + +/* we print at least 3 significant digits with at most 5 chars, except + if larger than 9999T. The value is rounded to zero. */ +char *get_si_prefix(char *buf, int buf_size, uint64_t val) +{ + static const char suffixes[4] = "kMGT"; + uint64_t base; + int i; + + if (val <= 999) { + snprintf(buf, buf_size, "%" PRId64, val); + } else { + base = 1000; + for(i=0;i<4;i++) { + /* Note: we round to 0 */ + if (val < base * 10) { + snprintf(buf, buf_size, "%0.2f%c", + floor((val * 100.0) / base) / 100.0, + suffixes[i]); + break; + } else if (val < base * 100) { + snprintf(buf, buf_size, "%0.1f%c", + floor((val * 10.0) / base) / 10.0, + suffixes[i]); + break; + } else if (val < base * 1000 || (i == 3)) { + snprintf(buf, buf_size, + "%" PRId64 "%c", + val / base, + suffixes[i]); + break; + } + base = base * 1000; + } + } + return buf; +} diff --git a/gpt2/cp_utils.h b/gpt2/cp_utils.h new file mode 100644 index 0000000..74deaa0 --- /dev/null +++ b/gpt2/cp_utils.h @@ -0,0 +1,48 @@ +/* + * Compression utilities + * + * Copyright (c) 2018-2019 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included 
in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "arith.h" +#include "libnc.h" + +void __attribute__((noreturn, format(printf, 1, 2))) fatal_error(const char *fmt, ...); + +int64_t get_time_ms(void); +void fput_u8(FILE *f, uint8_t v); +int fget_u8(FILE *f, uint8_t *pv); +void fput_be16(FILE *f, uint16_t v); +int fget_be16(FILE *f, uint16_t *pv); +void fput_be32(FILE *f, uint32_t v); +int fget_be32(FILE *f, uint32_t *pv); +void fput_f32(FILE *f, float v); +int fget_f32(FILE *f, float *pv); +void fput_sgd_opt(FILE *f, const SGDOptParams *p); +int fget_sgd_opt(FILE *f, SGDOptParams *p); +void dump_sgd_opt_params(FILE *f, const SGDOptParams *p); + +void write_sym(PutBitState *pb, const float *prob_table, int n_symb, int sym); +int read_sym(GetBitState *gb, const float *prob_table, int n_symb); + +void create_debug_dir(char *debug_dir, size_t debug_dir_size, + const char *debug_path, const char *prefix); +char *get_si_prefix(char *buf, int buf_size, uint64_t val); + diff --git a/gpt2/cutils.h b/gpt2/cutils.h new file mode 100644 index 0000000..68c1df2 --- /dev/null +++ b/gpt2/cutils.h @@ -0,0 +1,152 @@ +#ifndef CUTILS_H +#define CUTILS_H + +#include + +#define force_inline inline __attribute__((always_inline)) +#define no_inline __attribute__((noinline)) +#define __unused __attribute__((unused)) +#define xglue(x, y) x ## y +#define glue(x, y) xglue(x, y) +#ifndef offsetof +#define offsetof(type, field) ((size_t) &((type *)0)->field) +#endif +#define countof(x) 
(sizeof(x) / sizeof(x[0])) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +typedef int BOOL; + +#ifndef FALSE +enum { + FALSE = 0, + TRUE = 1, +}; +#endif + +typedef struct { + uint16_t u16; +} bfloat16_t; + +#if defined(__x86_64__) +static inline int64_t get_cycles(void) +{ + uint32_t low,high; + int64_t val; + asm volatile("rdtsc" : "=a" (low), "=d" (high)); + val = high; + val <<= 32; + val |= low; + return val; +} +#else +static inline int64_t get_cycles(void) +{ + int64_t val; + asm volatile ("rdtsc" : "=A" (val)); + return val; +} +#endif + +static inline int max_int(int a, int b) +{ + if (a > b) + return a; + else + return b; +} + +static inline int min_int(int a, int b) +{ + if (a < b) + return a; + else + return b; +} + +static inline size_t max_size_t(size_t a, size_t b) +{ + if (a > b) + return a; + else + return b; +} + +static inline size_t min_size_t(size_t a, size_t b) +{ + if (a < b) + return a; + else + return b; +} + +static inline ssize_t max_ssize_t(ssize_t a, ssize_t b) +{ + if (a > b) + return a; + else + return b; +} + +static inline ssize_t min_ssize_t(ssize_t a, ssize_t b) +{ + if (a < b) + return a; + else + return b; +} + +static inline int clamp_int(int val, int min_val, int max_val) +{ + if (val < min_val) + return min_val; + else if (val > max_val) + return max_val; + else + return val; +} + +static inline float clamp_float(float val, float min_val, float max_val) +{ + if (val < min_val) + return min_val; + else if (val > max_val) + return max_val; + else + return val; +} + +/* WARNING: undefined if a = 0 */ +static inline int clz32(unsigned int a) +{ + return __builtin_clz(a); +} + +/* WARNING: undefined if a = 0 */ +static inline int clz64(uint64_t a) +{ + return __builtin_clzll(a); +} + +static inline int floor_log2(uint64_t a) +{ + return 63 - clz64(a); +} + +static inline int ceil_log2(uint64_t a) +{ + if (a <= 1) + return 0; + else + return 64 - clz64(a - 1); +} + +static inline 
float squaref(float x) +{ + return x * x; +} + +#define DUP8(a) a, a, a, a, a, a, a, a + +#endif /* CUTILS_H */ + diff --git a/gpt2/gpt2tc.c b/gpt2/gpt2tc.c new file mode 100644 index 0000000..3ffd4ae --- /dev/null +++ b/gpt2/gpt2tc.c @@ -0,0 +1,2023 @@ +/* + * Text Completion with GPT-2 Transformer + * + * Copyright (c) 2019-2021 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cutils.h" +#include "arith.h" +#include "libnc.h" +#include "cp_utils.h" +#include "list.h" +#include "gpt2tc.h" + + +/************************************************/ +/* Transformer model */ + +static int nb_threads = 1; + +/* [seg_len, d_model] -> + [n_head, seg_len, d_model/n_head] */ +static NCTensor *split_head(NCTensor *x, int n_head) +{ + const size_t *dims; + int n_dims, axis[3]; + + dims = nc_tensor_get_dims(x, &n_dims); + assert(n_dims == 2); + assert((dims[0] % n_head) == 0); + x = nc_reshape_3d(x, dims[0] / n_head, n_head, dims[1]); + /* [seg_len, n_head, d_model/n_head] */ + axis[0] = 0; + axis[1] = 2; + axis[2] = 1; + return nc_permute(x, 3, axis); +} + +/* [n_head, seg_len, d_value] + -> [seg_len, d_value * n_head] */ +static NCTensor *concat_head(NCTensor *x) +{ + const size_t *dims; + int n_dims, axis[3]; + + axis[0] = 0; + axis[1] = 2; + axis[2] = 1; + x = nc_permute(x, 3, axis); + dims = nc_tensor_get_dims(x, &n_dims); + assert(n_dims == 3); + /* [seg_len, n_head, d_value] */ + return nc_reshape_2d(x, dims[0] * dims[1], dims[2]); +} + +#define MAT_STRIDE 64 + +/* convert the matrix to strided representation */ +static void convert_mat(NCTensor **pw) +{ + NCTensor *w; + int m, n, n_dims, r; + const size_t *dims; + int axis[3]; + + w = *pw; + dims = nc_tensor_get_dims(w, &n_dims); + assert(n_dims == 2); + m = dims[0]; + n = dims[1]; + r = (-m) % MAT_STRIDE; + if (r < 0) + r += MAT_STRIDE; + w = nc_pad(w, 0, NC_PAD_ZERO, r, NC_PAD_ZERO); + w = nc_reshape_3d(w, MAT_STRIDE, (m + MAT_STRIDE - 1) / MAT_STRIDE, n); + axis[0] = 0; + axis[1] = 2; + axis[2] = 1; + w = nc_permute(w, 3, axis); + *pw = w; +} + +static TransformerModel *trf_init(const TransformerModelParams *p, + const char *coefs_filename) +{ + TransformerModel *s; + NCContext *m; + NCDevice *d; + int layer_idx; + TransformerLayer 
*layers, *tl; + + s = nc_mallocz(sizeof(*s)); + rnd_init(&s->rnd_state, p->seed); + s->n_layer = p->n_layer; + s->d_model = p->d_model; + s->n_head = p->n_head; + s->d_key = p->d_key; + s->d_value = p->d_value; + s->d_inner = p->d_inner; + s->n_ctx = p->n_ctx; + s->n_symbols = p->n_symbols; + + m = nc_context_init(nb_threads); + s->model = m; + d = nc_new_cpu_device(m); + s->device = d; + + nc_param_list_init(&s->param_list); + /* disable graph for the parameters */ + nc_param_list_set_graph(&s->param_list, FALSE); + + layers = nc_mallocz(sizeof(layers[0]) * s->n_layer); + s->layers = layers; + for(layer_idx = 0; layer_idx < s->n_layer; layer_idx++) { + tl = &layers[layer_idx]; + tl->ln_1_g = nc_new_tensor_1d(d, NC_TYPE_F32, s->d_model); + nc_new_param(&s->param_list, &tl->ln_1_g, "h%d/ln_1/g", layer_idx); + + tl->ln_1_b = nc_new_tensor_1d(d, NC_TYPE_F32, s->d_model); + nc_new_param(&s->param_list, &tl->ln_1_b, "h%d/ln_1/b", layer_idx); + + tl->attn_w = nc_new_tensor_2d(d, NC_TYPE_F16, s->n_head * s->d_key * 3, + s->d_model); + nc_new_param(&s->param_list, &tl->attn_w, + "h%d/attn/c_attn/w", layer_idx); + + tl->attn_b = nc_new_tensor_1d(d, NC_TYPE_F32, s->n_head * s->d_key * 3); + nc_new_param(&s->param_list, &tl->attn_b, + "h%d/attn/c_attn/b", layer_idx); + + tl->attn_proj_w = nc_new_tensor_2d(d, NC_TYPE_F16, s->d_model, + s->n_head * s->d_value); + nc_new_param(&s->param_list, &tl->attn_proj_w, + "h%d/attn/c_proj/w", layer_idx); + + tl->attn_proj_b = nc_new_tensor_1d(d, NC_TYPE_F32, s->d_model); + nc_new_param(&s->param_list, &tl->attn_proj_b, + "h%d/attn/c_proj/b", layer_idx); + + tl->ln_2_g = nc_new_tensor_1d(d, NC_TYPE_F32, s->d_model); + nc_new_param(&s->param_list, &tl->ln_2_g, "h%d/ln_2/g", layer_idx); + + tl->ln_2_b = nc_new_tensor_1d(d, NC_TYPE_F32, s->d_model); + nc_new_param(&s->param_list, &tl->ln_2_b, "h%d/ln_2/b", layer_idx); + + tl->mlp_fc_w = nc_new_tensor_2d(d, NC_TYPE_F16, s->d_inner, + s->d_model); + nc_new_param(&s->param_list, &tl->mlp_fc_w, + 
"h%d/mlp/c_fc/w", layer_idx); + + tl->mlp_fc_b = nc_new_tensor_1d(d, NC_TYPE_F32, s->d_inner); + nc_new_param(&s->param_list, &tl->mlp_fc_b, + "h%d/mlp/c_fc/b", layer_idx); + + tl->mlp_proj_w = nc_new_tensor_2d(d, NC_TYPE_F16, s->d_model, + s->d_inner); + nc_new_param(&s->param_list, &tl->mlp_proj_w, + "h%d/mlp/c_proj/w", layer_idx); + + tl->mlp_proj_b = nc_new_tensor_1d(d, NC_TYPE_F32, s->d_model); + nc_new_param(&s->param_list, &tl->mlp_proj_b, + "h%d/mlp/c_proj/b", layer_idx); + } + + s->ln_f_g = nc_new_tensor_1d(d, NC_TYPE_F32, s->d_model); + nc_new_param(&s->param_list, &s->ln_f_g, "ln_f/g"); + + s->ln_f_b = nc_new_tensor_1d(d, NC_TYPE_F32, s->d_model); + nc_new_param(&s->param_list, &s->ln_f_b, "ln_f/b"); + + s->wte = nc_new_tensor_2d(d, NC_TYPE_F16, s->d_model, + s->n_symbols); + nc_new_param(&s->param_list, &s->wte, "wte"); + + s->wpe = nc_new_tensor_2d(d, NC_TYPE_F32, s->d_model, + s->n_ctx); + nc_new_param(&s->param_list, &s->wpe, "wpe"); + + nc_load_coefs(&s->param_list, coefs_filename); + + /* optimize the variable storage */ + s->wte_trans = nc_transpose(nc_dup_tensor(s->wte)); + + convert_mat(&s->wte_trans); + + for(layer_idx = 0; layer_idx < s->n_layer; layer_idx++) { + tl = &layers[layer_idx]; + convert_mat(&tl->attn_w); + convert_mat(&tl->attn_proj_w); + convert_mat(&tl->mlp_fc_w); + convert_mat(&tl->mlp_proj_w); + } + return s; +} + +typedef struct { + int mem_len; + NCTensor **mem_k; + NCTensor **mem_v; +} BatchEntry; + +/* dimensions: output[train_len * n_streams][n_symbols], + input[train_len * n_streams], tab_mem[n_streams], mem_k[n_layer] + mem_v[n_layer]. 
*/ +static NCTensor *trf_eval(TransformerModel *s, int train_len, + int n_streams, BatchEntry *tab_mem, + NCTensor *input) +{ + NCTensor *layer_input, **tab_tmp, *output, *position; + TransformerLayer *tl; + int layer_idx, i, j, *ptr; + BatchEntry *be; + + tab_tmp = nc_mallocz(sizeof(tab_tmp[0]) * + max_int(max_int(3, train_len), + max_int(s->n_head, s->n_layer))); + + position = nc_new_tensor_1d(s->device, NC_TYPE_I32, + train_len * n_streams); + ptr = nc_tensor_get_ptr(position, NULL); + for(i = 0; i < train_len; i++) { + for(j = 0; j < n_streams; j++) { + ptr[i * n_streams + j] = tab_mem[j].mem_len + i; + } + } + + layer_input = nc_get_col(nc_dup_tensor(s->wte), input); + layer_input = nc_convert(layer_input, NC_TYPE_F32); + layer_input = nc_add(layer_input, nc_get_col(nc_dup_tensor(s->wpe), + position)); + + for(layer_idx = 0; layer_idx < s->n_layer; layer_idx++) { + NCTensor *query, *key, *value, *ff_input, *t0, **tab_tmp2; + + tl = &s->layers[layer_idx]; + + t0 = nc_add(nc_mul(nc_layer_norm(nc_dup_tensor(layer_input), 1e-5), + nc_dup_tensor(tl->ln_1_g)), + nc_dup_tensor(tl->ln_1_b)); + + t0 = nc_add(nc_matmul_stride(nc_dup_tensor(tl->attn_w), t0), + nc_dup_tensor(tl->attn_b)); + tab_tmp2 = nc_mallocz(sizeof(tab_tmp2[0]) * n_streams); + + /* [ train_len * n_streams d_model * 3] -> + n_streams * [ train_len d_model * 3] */ + nc_hsplit(tab_tmp2, t0, n_streams, NULL); + for(i = 0; i < n_streams; i++) { + be = &tab_mem[i]; + + t0 = tab_tmp2[i]; + nc_vsplit(tab_tmp, t0, 3, NULL); + query = tab_tmp[0]; + key = tab_tmp[1]; + value = tab_tmp[2]; + + /* split query, key and value for each head */ + key = split_head(key, s->n_head); + query = split_head(query, s->n_head); + value = split_head(value, s->n_head); + + /* save the key and value to the memory */ + t0 = nc_slice_alias(be->mem_k[layer_idx], + 1, be->mem_len, be->mem_len + train_len); + nc_tensor_copy(t0, key); + nc_free_tensor(t0); + nc_free_tensor(key); + + t0 = nc_slice_alias(be->mem_v[layer_idx], + 1, 
be->mem_len, be->mem_len + train_len); + nc_tensor_copy(t0, value); + nc_free_tensor(t0); + nc_free_tensor(value); + + key = nc_slice_alias(be->mem_k[layer_idx], + 1, 0, be->mem_len + train_len); + value = nc_slice_alias(be->mem_v[layer_idx], + 1, 0, be->mem_len + train_len); + + /* cross product term */ + t0 = nc_matmul_add(key, query, NULL, + TRUE, FALSE); + t0 = nc_mul(t0, nc_new_f32(s->device, 1.0f / sqrtf(s->d_key))); + + /* set the future cross products to -infinity so that they + don't change the softmax result */ + t0 = nc_slt_mat_set(t0, be->mem_len + 1, -INFINITY); + + t0 = nc_soft_max(t0); + t0 = nc_matmul(value, t0); + + /* merge all the heads */ + tab_tmp2[i] = concat_head(t0); + } + + t0 = nc_hconcat(tab_tmp2, n_streams); + nc_free(tab_tmp2); + + /* projection */ + t0 = nc_add(nc_matmul_stride(nc_dup_tensor(tl->attn_proj_w), t0), + nc_dup_tensor(tl->attn_proj_b)); + + t0 = nc_add(t0, layer_input); + + ff_input = nc_dup_tensor(t0); + + t0 = nc_add(nc_mul(nc_layer_norm(t0, 1e-5), + nc_dup_tensor(tl->ln_2_g)), + nc_dup_tensor(tl->ln_2_b)); + + t0 = nc_add(nc_matmul_stride(nc_dup_tensor(tl->mlp_fc_w), t0), + nc_dup_tensor(tl->mlp_fc_b)); + t0 = nc_gelu(t0); + + t0 = nc_add(nc_matmul_stride(nc_dup_tensor(tl->mlp_proj_w), t0), + nc_dup_tensor(tl->mlp_proj_b)); + + layer_input = nc_add(t0, ff_input); + } + + { + NCTensor *t0; + t0 = nc_add(nc_mul(nc_layer_norm(layer_input, 1e-5), + nc_dup_tensor(s->ln_f_g)), + nc_dup_tensor(s->ln_f_b)); + + t0 = nc_matmul_stride(nc_dup_tensor(s->wte_trans), t0); + /* need to resize the output to the exact size because the + strided matrix is larger */ + output = nc_resize(t0, s->n_symbols); + } + nc_free(tab_tmp); + return output; +} + +static void trf_end(TransformerModel *s) +{ + nc_free_tensor(s->wte_trans); + + nc_param_list_end(&s->param_list); + nc_free(s->layers); + nc_context_end(s->model); + nc_free(s); +} + +static const char *gpt2_model_name[] = { "117M", "345M", "774M", "1558M" }; + +GPT2ModelEnum 
parse_model(const char *str) +{ + int i; + for(i = 0; i < countof(gpt2_model_name); i++) { + if (!strcmp(gpt2_model_name[i], str)) + return i; + } + return (GPT2ModelEnum)-1; +} + +void trf_set_params(TransformerModelParams *p, GPT2ModelEnum model) +{ + memset(p, 0, sizeof(*p)); + p->seed = 123; + switch(model) { + case GPT2_MODEL_117M: + p->n_layer = 12; + p->d_model = 768; + break; + case GPT2_MODEL_345M: + p->n_layer = 24; + p->d_model = 1024; + break; + case GPT2_MODEL_774M: + p->n_layer = 36; + p->d_model = 1280; + break; + case GPT2_MODEL_1558M: + p->n_layer = 48; + p->d_model = 1600; + break; + default: + abort(); + } + p->d_key = 64; + p->n_head = p->d_model / p->d_key; + p->d_value = p->d_key; + p->d_inner = p->d_model * 4; + p->n_ctx = 1024; + p->n_symbols = 50257; +} + +typedef uint16_t DataSymbol; + +/****************************************************************/ +/* preprocessor */ + +static uint32_t hash_calc(const uint8_t *buf, int len, int n_bits) +{ + uint32_t h; + int i; + + h = 1; + for(i = 0; i < len; i++) { + h = h * 263 + buf[i]; + } + return h & ((1 << n_bits) - 1); +} + +static void hash_resize(WordList *s, int hash_bits) +{ + int i, h; + Word *p; + + s->hash_bits = hash_bits; + s->hash_size = 1 << hash_bits; + free(s->hash_table); + s->hash_table = malloc(sizeof(s->hash_table[0]) * s->hash_size); + for(i = 0; i < s->hash_size; i++) + s->hash_table[i] = -1; + for(i = 0; i < s->word_count; i++) { + p = &s->words[i]; + h = hash_calc(p->buf, p->len, s->hash_bits); + p->next = s->hash_table[h]; + s->hash_table[h] = i; + } +} + +static WordList *word_list_init(void) +{ + WordList *s; + + s = malloc(sizeof(WordList)); + memset(s, 0, sizeof(*s)); + s->word_count = 0; + s->word_size = 0; + hash_resize(s, 12); + return s; +} + +static void word_list_end(WordList *s) +{ + int i; + Word *p; + + for(i = 0; i < s->word_count; i++) { + p = &s->words[i]; + free(p->buf); + } + free(s->words); + free(s->hash_table); + free(s); +} + +static int64_t 
hash_lookup_count; +static int64_t hash_it_count; + +/* the hash size contains HASH_SIZE_FACTOR times more entries */ +#define HASH_SIZE_FACTOR 2 + +static Word *word_find_add(WordList *s, const uint8_t *buf, int len, int add) +{ + uint32_t h, idx; + Word *p; + + h = hash_calc(buf, len, s->hash_bits); + idx = s->hash_table[h]; + hash_lookup_count++; + while (idx != -1) { + hash_it_count++; + p = &s->words[idx]; + if (p->len == len && !memcmp(p->buf, buf, len)) + return p; + idx = p->next; + } + + if (!add) + return NULL; + + if (s->word_count >= s->word_size) { + size_t new_size = s->word_size + s->word_size / 2; + if (new_size < 32) + new_size = 32; + if (s->word_count + 1 > new_size) + new_size = s->word_count + 1; + s->words = realloc(s->words, new_size * sizeof(s->words[0])); + s->word_size = new_size; + + } + /* resize the hash table when needed */ + if ((s->word_count * HASH_SIZE_FACTOR) > s->hash_size) { + int hash_bits = s->hash_bits; + while ((s->word_count * HASH_SIZE_FACTOR) > (1 << hash_bits)) + hash_bits++; + hash_resize(s, hash_bits); + + /* recompute the hash with the new hash table size */ + h = hash_calc(buf, len, s->hash_bits); + } + + idx = s->word_count++; + p = &s->words[idx]; + p->len = len; + p->buf = malloc(len + 1); + memcpy(p->buf, buf, len); + p->buf[len] = 0; + p->next = s->hash_table[h]; + s->hash_table[h] = idx; + return p; +} + +static void word_load(WordList *s, const char *filename) +{ + FILE *f; + uint8_t buf[1024]; + int len, c; + + f = fopen(filename, "rb"); + if (!f) { + perror(filename); + exit(1); + } + len = 0; + for(;;) { + c = fgetc(f); + if (c < 0) + break; + if (c == '\n') { + if (len > 0) { + word_find_add(s, buf, len, TRUE); + } + len = 0; + } else { + if (c == '\\') { + c = fgetc(f); + if (c < 0) + break; + if (c == 'n') { + c = '\n'; + } else if (c != '\\') { + fprintf(stderr, "Invalid escape\n"); + exit(1); + } + } + if (len >= sizeof(buf)) { + fprintf(stderr, "Word too long\n"); + exit(1); + } + buf[len++] = c; + } 
+ } + fclose(f); +} + +typedef enum { + CAT_SPACE, + CAT_LETTER, + CAT_NUMBER, + CAT_OTHER, +} CharCatEnum; + +static int get_char_cat(int c) +{ + if (c == ' ') { + return CAT_SPACE; + } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || + c >= 128) { + return CAT_LETTER; + } else if (c >= '0' && c <= '9') { + return CAT_NUMBER; + } else { + return CAT_OTHER; + } +} + +static BOOL match(size_t *pmatch_len, + const uint8_t *buf, size_t buf_len, const char *str) +{ + size_t len; + len = strlen(str); + if (len <= buf_len && !memcmp(buf, str, len)) { + *pmatch_len = len; + return TRUE; + } else { + *pmatch_len = 0; + return FALSE; + } +} + +static size_t gpt2_get_word(const uint8_t *buf, size_t buf_len) +{ + size_t len, p; + int cat; + + if (buf_len == 0) + return 0; + if (buf[0] == '\'' && + (match(&len, buf, buf_len, "'s") || + match(&len, buf, buf_len, "'t") || + match(&len, buf, buf_len, "'re") || + match(&len, buf, buf_len, "'ve") || + match(&len, buf, buf_len, "'m") || + match(&len, buf, buf_len, "'ll") || + match(&len, buf, buf_len, "'d"))) { + return len; + } + p = 0; + if (buf[0] == ' ' && buf_len >= 2) + p++; + if (buf[p] != ' ') { + cat = get_char_cat(buf[p]); + len = 1 + p; + while (len < buf_len && get_char_cat(buf[len]) == cat) + len++; + return len; + } else { + return 1; + } +} + +static __unused void print_word(const uint8_t *buf, size_t len) +{ + size_t i; + int c; + for(i = 0; i < len; i++) { + c = buf[i]; + if (c >= ' ' && c <= '~') + putchar(c); + else + printf("\\x%02x", c); + } +} + +void gpt2_pp_encode(const char *word_filename, + const char *in_filename, const char *out_filename) +{ + FILE *f, *fo; + size_t buf_size, buf_pos, word_len, len, i; + uint8_t *buf; + WordList *s; + Word *p; + + f = fopen(in_filename, "rb"); + if (!f) { + perror(in_filename); + exit(1); + } + + fseek(f, 0, SEEK_END); + buf_size = ftell(f); + fseek(f, 0, SEEK_SET); + buf = malloc(buf_size * sizeof(buf[0])); + fread(buf, 1, buf_size, f); + fclose(f); + + s = 
word_list_init(); + word_load(s, word_filename); + + fo = fopen(out_filename, "wb"); + if (!fo) { + perror(out_filename); + exit(1); + } + + for(buf_pos = 0; buf_pos < buf_size; buf_pos += word_len) { + word_len = gpt2_get_word(buf + buf_pos, buf_size - buf_pos); +#if 0 + print_word(buf + buf_pos, word_len); + printf("\n"); +#endif + /* find the longest word(s) */ + for(i = 0; i < word_len; i += len) { + for(len = word_len - i; len >= 1; len--) { + p = word_find_add(s, buf + buf_pos + i, len, FALSE); + if (p) + break; + } + assert(len >= 1); + fput_be16(fo, p - s->words); + } + } + + free(buf); + + fclose(fo); + + word_list_end(s); +} + +#define SYMB_EOT 50256 + +static void add_char(DataSymbol **pbuf, + size_t *psize, size_t *plen, DataSymbol c) +{ + size_t len = *plen, size = *psize; + if ((len + 1) > size) { + size = max_size_t(max_size_t(len + 1, 4), + size * 3 / 2); + *pbuf = realloc(*pbuf, sizeof(**pbuf) * size); + *psize = size; + } + (*pbuf)[len++] = c; + *plen = len; +} + +static void gpt2_pp_encode_buf1(WordList *s, + DataSymbol **pout_buf, + size_t *pout_buf_size, size_t *pout_buf_len, + const uint8_t *buf, size_t buf_size) +{ + size_t buf_pos, word_len, len, i; + Word *p; + + for(buf_pos = 0; buf_pos < buf_size; buf_pos += word_len) { + word_len = gpt2_get_word(buf + buf_pos, buf_size - buf_pos); +#if 0 + print_word(buf + buf_pos, word_len); + printf("\n"); +#endif + /* find the longest word(s) */ + for(i = 0; i < word_len; i += len) { + for(len = word_len - i; len >= 1; len--) { + p = word_find_add(s, buf + buf_pos + i, len, FALSE); + if (p) + break; + } + assert(len >= 1); + add_char(pout_buf, pout_buf_size, pout_buf_len, p - s->words); + } + } +} + +size_t gpt2_pp_encode_buf(WordList *s, DataSymbol **pout_buf, + const uint8_t *buf, size_t buf_size) +{ + size_t out_buf_len, out_buf_size; + DataSymbol *out_buf; + + out_buf_len = 0; + out_buf_size = 0; + out_buf = NULL; + gpt2_pp_encode_buf1(s, &out_buf, &out_buf_size, &out_buf_len, + buf, buf_size); + 
*pout_buf = out_buf; + return out_buf_len; +} + +void gpt2_pp_decode(const char *word_filename, + const char *in_filename, const char *out_filename) +{ + WordList *s; + FILE *f, *fo; + uint16_t c; + Word *p; + + s = word_list_init(); + word_load(s, word_filename); + + f = fopen(in_filename, "rb"); + if (!f) { + perror(in_filename); + exit(1); + } + + fo = fopen(out_filename, "wb"); + if (!fo) { + perror(out_filename); + exit(1); + } + + for(;;) { + if (fget_be16(f, &c)) + break; + if (c >= s->word_count) { + fprintf(stderr, "Invalid symbol: %d\n", c); + exit(1); + } + p = &s->words[c]; + fwrite(p->buf, 1, p->len, fo); + } + + fclose(fo); + + fclose(f); + + word_list_end(s); +} + +static struct option options[] = { + { NULL }, +}; + +/****************************************************************/ +/* text completion */ + +static int get_random_symb_topk(float *prob, size_t n_symb, int topk, + float topp, RNDState *rnd_state) +{ + NCTopKEntry *tab; + int i, c, k; + float p; + double sum; + + assert(n_symb >= 1); + + prof_start(PROF_WRITE_SYM); + k = nc_topk(&tab, &sum, prob, n_symb, topk, topp); + prof_end(PROF_WRITE_SYM); + + p = rnd_unif(rnd_state) * sum; + + sum = 0; + for(i = 0; i < k - 1; i++) { + sum += prob[tab[i].idx]; + if (p < sum) + break; + } + c = tab[i].idx; + nc_free(tab); + return c; +} + +static void dump_pred_symb(float *prob, size_t n_symb, int k, + WordList *wl) +{ +#if 0 + int *tab, i, c; + Word *wp; + + assert(n_symb >= 1); + tab = malloc(sizeof(tab[0]) * n_symb); + for(i = 0; i < n_symb; i++) + tab[i] = i; + topk_sort(tab, n_symb, prob); + + k = min_int(n_symb, k); + for(i = 0; i < k; i++) { + c = tab[i]; + printf("%d: %10.3g '", i, prob[c]); + wp = &wl->words[c]; + fwrite(wp->buf, 1, wp->len, stdout); + printf("'\n"); + } + free(tab); +#endif +} + +char *trim_text(const char *str) +{ + size_t len; + char *new_str; + while (*str == ' ') + str++; + len = strlen(str); + while (len > 0 && str[len - 1] == ' ') + len--; + new_str = malloc(len + 
1); + memcpy(new_str, str, len + 1); + return new_str; +} + +TextCompleteGlobalState *text_complete_global_init(GPT2ModelEnum model, + const char *filename) +{ + WordList *wl; + TransformerModelParams p_s, *p = &p_s; + TransformerModel *s; + TextCompleteGlobalState *tcs; + char coefs_filename[128]; + + tcs = nc_mallocz(sizeof(*tcs)); + + trf_set_params(p, model); + if (!filename) { + snprintf(coefs_filename, sizeof(coefs_filename), + "gpt2_%s.bin", gpt2_model_name[model]); + filename = coefs_filename; + } + s = trf_init(p, filename); + + wl = word_list_init(); + word_load(wl, "gpt2vocab.txt"); + tcs->wl = wl; + tcs->trf_state = s; + return tcs; +} + +void text_complete_global_end(TextCompleteGlobalState *tcs) +{ + trf_end(tcs->trf_state); + word_list_end(tcs->wl); + nc_free(tcs); +} + +TextGenContext *text_complete_start(TextCompleteGlobalState *tcs, + const char *input_text, + int top_k, float top_p, float temperature, + int seed, int max_output_len) +{ + TransformerModel *s = tcs->trf_state; + WordList *wl = tcs->wl; + TextGenContext *ts; + int i, mem_len; + + ts = nc_mallocz(sizeof(*ts)); + ts->global_state = tcs; + ts->top_k = top_k; + ts->top_p = top_p; + ts->temperature = temperature; + rnd_init(&ts->rnd_state, seed); + ts->max_output_len = max_output_len; + ts->input_buf_len = gpt2_pp_encode_buf(wl, &ts->input_buf, + (const uint8_t *)input_text, + strlen(input_text)); + if (ts->input_buf_len > MAX_INITIAL_TEXT_LEN) { + memmove(ts->input_buf, ts->input_buf + ts->input_buf_len - MAX_INITIAL_TEXT_LEN, MAX_INITIAL_TEXT_LEN * sizeof(ts->input_buf[0])); + ts->input_buf_len = MAX_INITIAL_TEXT_LEN; + ts->input_buf = realloc(ts->input_buf, + ts->input_buf_len * sizeof(ts->input_buf[0])); + } + +#if 0 + for(i = 0; i < ts->input_buf_len; i++) { + printf(" %04x", ts->input_buf[i]); + } + printf("\n"); +#endif + + ts->mem_k = nc_mallocz(sizeof(ts->mem_k[0]) * s->n_layer); + ts->mem_v = nc_mallocz(sizeof(ts->mem_v[0]) * s->n_layer); + mem_len = ts->input_buf_len + 
max_output_len; + for(i = 0; i < s->n_layer; i++) { + ts->mem_k[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + s->d_key, mem_len, s->n_head); + nc_tensor_set_name(ts->mem_k[i], "mem_k_%d", i); + ts->mem_v[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + s->d_value, mem_len, s->n_head); + nc_tensor_set_name(ts->mem_v[i], "mem_v_%d", i); + } + ts->text_len = ts->input_buf_len; + ts->is_first = TRUE; + return ts; +} + +static void text_complete_symb(TextCompleteGlobalState *tcs, + TextGenContext *ts, NCTensor *logits) +{ + TransformerModel *s = tcs->trf_state; + WordList *wl = tcs->wl; + Word *wp; + NCTensorData xbuf, *x; + int c, out_len; + NCTensor *t0; + + t0 = logits; + if (ts->temperature != 1.0) + t0 = nc_mul(t0, nc_new_f32(s->device, 1.0f / ts->temperature)); + t0 = nc_soft_max(t0); + x = nc_tensor_get_data(&xbuf, t0); + + if (0) { + printf("\n"); + dump_pred_symb((float *)x->data, s->n_symbols, 10, wl); + } + c = get_random_symb_topk((float *)x->data, + s->n_symbols, ts->top_k, ts->top_p, + &ts->rnd_state); + if (c == SYMB_EOT) { + ts->out_text_len = 0; + ts->out_text[0] = '\0'; + } else { + wp = &wl->words[c]; + out_len = min_int(sizeof(ts->out_text) - 1, wp->len); + memcpy(ts->out_text, wp->buf, out_len); + ts->out_text[out_len] = '\0'; + ts->out_text_len = out_len; + } + ts->last_c = c; + + nc_free_tensor(t0); +} + +/* Note: ts_list is emptied */ +void text_complete_next(TextCompleteGlobalState *tcs, + struct list_head *ts_list) +{ + TransformerModel *s = tcs->trf_state; + int i, k; + NCTensor *output, *input; + int32_t *ptr; + struct list_head *el, *el1; + TextGenContext *ts, **ts_tab; + int batch_size; + BatchEntry tab_mem[BATCH_SIZE_MAX]; + + list_for_each_safe(el, el1, ts_list) { + ts = list_entry(el, TextGenContext, link); + if (ts->text_len >= s->n_ctx || + (ts->text_len - ts->input_buf_len) >= ts->max_output_len) { + ts->out_text_len = 0; + ts->out_text[0] = '\0'; + list_del(&ts->link); + } else if (ts->is_first) { + input = 
nc_new_tensor_1d(s->device, NC_TYPE_I32, ts->text_len); + ptr = nc_tensor_get_ptr(input, NULL); + for(i = 0; i < ts->text_len; i++) { + ptr[i] = ts->input_buf[i]; + } + + prof_start(PROF_EVAL); + tab_mem[0].mem_len = 0; + tab_mem[0].mem_k = ts->mem_k; + tab_mem[0].mem_v = ts->mem_v; + output = trf_eval(s, ts->text_len, 1, tab_mem, input); + prof_end(PROF_EVAL); + + text_complete_symb(tcs, ts, nc_slice_alias(output, 1, ts->text_len - 1, ts->text_len)); + nc_free_tensor(output); + + ts->text_len++; + ts->is_first = FALSE; + list_del(&ts->link); + } + } + + ts_tab = nc_mallocz(sizeof(ts_tab[0]) * BATCH_SIZE_MAX); + for(;;) { + k = 0; + list_for_each_safe(el, el1, ts_list) { + ts = list_entry(el, TextGenContext, link); + ts_tab[k++] = ts; + list_del(&ts->link); + if (k >= BATCH_SIZE_MAX) + break; + } + if (k == 0) + break; + batch_size = k; + // printf("batch_size=%d\n", k); + + for(k = 0; k < batch_size; k++) { + ts = ts_tab[k]; + tab_mem[k].mem_len = ts->text_len - 1; + tab_mem[k].mem_k = ts->mem_k; + tab_mem[k].mem_v = ts->mem_v; + } + + /* compute the next probabilities */ + input = nc_new_tensor_1d(s->device, NC_TYPE_I32, batch_size); + ptr = nc_tensor_get_ptr(input, NULL); + for(k = 0; k < batch_size; k++) { + ts = ts_tab[k]; + ptr[k] = ts->last_c; + } + + prof_start(PROF_EVAL); + output = trf_eval(s, 1, batch_size, tab_mem, input); + prof_end(PROF_EVAL); + + for(k = 0; k < batch_size; k++) { + ts = ts_tab[k]; + text_complete_symb(tcs, ts, + nc_slice_alias(output, 1, k, k + 1)); + + ts->text_len++; + ts->is_first = FALSE; + } + nc_free_tensor(output); + } + nc_free(ts_tab); +} + +void text_complete_end(TextGenContext *ts) +{ + TransformerModel *s = ts->global_state->trf_state; + int i; + + for(i = 0; i < s->n_layer; i++) { + nc_free_tensor(ts->mem_k[i]); + nc_free_tensor(ts->mem_v[i]); + } + nc_free(ts->mem_k); + nc_free(ts->mem_v); + + free(ts->input_buf); + nc_free(ts); +} + +void text_complete(GPT2ModelEnum model, const char *model_filename, + const char 
*input_text, + int top_k, float top_p, float temperature, + int max_output_len, int batch_size, int seed, + BOOL verbose) +{ + TextCompleteGlobalState *tcs; + TextGenContext *ts; + int count; + struct timeval tv; + const char *input_text1; + struct list_head ts_list; + int64_t ti; + + tcs = text_complete_global_init(model, model_filename); + + if (seed == 0) { + gettimeofday(&tv, NULL); + seed = tv.tv_sec + tv.tv_usec; + } + + input_text1 = trim_text(input_text); + if (input_text1[0] == '\0') + input_text1 = strdup(" "); + printf("%s", input_text1); + fflush(stdout); + prof_start(PROF_TOTAL); + if (batch_size == 0) { + ts = text_complete_start(tcs, input_text1, top_k, top_p, temperature, + seed, max_output_len); + + ti = get_time_ms(); + count = 0; + for(;;) { + init_list_head(&ts_list); + list_add_tail(&ts->link, &ts_list); + text_complete_next(tcs, &ts_list); + if (ts->out_text_len == 0) + break; + fwrite(ts->out_text, 1, ts->out_text_len, stdout); + fflush(stdout); + count++; + } + printf("\n"); + text_complete_end(ts); + } else { + TextGenContext **ts_tab; + int i; + + /* test for batch processing (the same text is generated by + each job) */ + + ts_tab = nc_mallocz(sizeof(ts_tab[0]) * batch_size); + + for(i = 0; i < batch_size; i++) { + ts = text_complete_start(tcs, input_text1, top_k, top_p, + temperature, seed, max_output_len); + ts_tab[i] = ts; + } + + ti = get_time_ms(); + count = 0; + for(;;) { + init_list_head(&ts_list); + for(i = 0; i < batch_size; i++) { + ts = ts_tab[i]; + if (ts->is_first || ts->out_text_len > 0) { + list_add_tail(&ts->link, &ts_list); + } + } + if (list_empty(&ts_list)) + break; + text_complete_next(tcs, &ts_list); + + for(i = 0; i < batch_size; i++) { + ts = ts_tab[i]; + if (ts->out_text_len > 0 && i == 0) { + fwrite(ts->out_text, 1, ts->out_text_len, stdout); + fflush(stdout); + } + } + count++; + } + printf("\n"); + + for(i = 0; i < batch_size; i++) { + ts = ts_tab[i]; + text_complete_end(ts); + } + nc_free(ts_tab); + } + ti = 
get_time_ms() - ti; + if (verbose) { + printf("time=%0.1f word/s\n", + (double)count / ti * 1000); + } + prof_end(PROF_TOTAL); + text_complete_global_end(tcs); + + nc_prof_dump(); +} + +/******************************************************************/ +/* short text compression */ + +/* Note: at most 31 bits are encoded. At most UTF8_CHAR_LEN_MAX bytes + are output. */ +int unicode_to_utf8(uint8_t *buf, unsigned int c) +{ + uint8_t *q = buf; + + if (c < 0x80) { + *q++ = c; + } else { + if (c < 0x800) { + *q++ = (c >> 6) | 0xc0; + } else { + if (c < 0x10000) { + *q++ = (c >> 12) | 0xe0; + } else { + if (c < 0x00200000) { + *q++ = (c >> 18) | 0xf0; + } else { + if (c < 0x04000000) { + *q++ = (c >> 24) | 0xf8; + } else if (c < 0x80000000) { + *q++ = (c >> 30) | 0xfc; + *q++ = ((c >> 24) & 0x3f) | 0x80; + } else { + return 0; + } + *q++ = ((c >> 18) & 0x3f) | 0x80; + } + *q++ = ((c >> 12) & 0x3f) | 0x80; + } + *q++ = ((c >> 6) & 0x3f) | 0x80; + } + *q++ = (c & 0x3f) | 0x80; + } + return q - buf; +} + +static const unsigned int utf8_min_code[5] = { + 0x80, 0x800, 0x10000, 0x00200000, 0x04000000, +}; + +static const unsigned char utf8_first_code_mask[5] = { + 0x1f, 0xf, 0x7, 0x3, 0x1, +}; + +/* return -1 if error. *pp is not updated in this case. max_len must + be >= 1. The maximum length for a UTF8 byte sequence is 6 bytes. */ +int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp) +{ + int l, c, b, i; + + c = *p++; + if (c < 0x80) { + *pp = p; + return c; + } + switch(c) { + case 0xc0 ... 0xdf: + l = 1; + break; + case 0xe0 ... 0xef: + l = 2; + break; + case 0xf0 ... 0xf7: + l = 3; + break; + case 0xf8 ... 0xfb: + l = 4; + break; + case 0xfc ... 
0xfd: + l = 5; + break; + default: + return -1; + } + /* check that we have enough characters */ + if (l > (max_len - 1)) + return -1; + c &= utf8_first_code_mask[l - 1]; + for(i = 0; i < l; i++) { + b = *p++; + if (b < 0x80 || b >= 0xc0) + return -1; + c = (c << 6) | (b & 0x3f); + } + if (c < utf8_min_code[l - 1]) + return -1; + *pp = p; + return c; +} + +static inline int simple_get_bit(const uint8_t *data, size_t index) +{ + return (data[index >> 3] >> (7 - (index & 7))) & 1; +} + +static inline void simple_put_bit(uint8_t *data, size_t index, int bit) +{ + data[index >> 3] |= bit << (7 - (index & 7)); +} + +static uint16_t ranges[3][2] = { + { 0x3400, 0x4DB5 }, + { 0x4e00, 0x9fcf }, + { 0xAC00, 0xD7A3 }, +}; + +static int c15_to_unicode(int c) +{ + int i, n, count; + for(i = 0; i < countof(ranges); i++) { + count = ranges[i][1] - ranges[i][0] + 1; + n = count; + if (c < n) { + return ranges[i][0] + c; + } + c -= count; + } + return -1; +} + +static int unicode_to_c15(int c) +{ + int i, b; + b = 0; + for(i = 0; i < countof(ranges); i++) { + if (c >= ranges[i][0] && c <= ranges[i][1]) + return b + c - ranges[i][0]; + b += ranges[i][1] - ranges[i][0] + 1; + } + return -1; +} + +size_t convert_to_chars(char **pout_buf, uint8_t *buf, size_t n_bits) +{ + size_t idx, out_buf_len; + int c, i, l, len; + char buf1[8], *out_buf; + + out_buf = malloc(4 * ((n_bits + 14) / 15) + 1); + out_buf_len = 0; + for(idx = 0; idx < n_bits; idx += 15) { + l = min_size_t(15, n_bits - idx); + c = 0; + for(i = 0; i < l; i++) { + c |= simple_get_bit(buf, idx + i) << (14 - i); + } + c = c15_to_unicode(c); + len = unicode_to_utf8((uint8_t *)buf1, c); + memcpy(out_buf + out_buf_len, buf1, len); + out_buf_len += len; + } + out_buf[out_buf_len] = '\0'; + *pout_buf = out_buf; + return out_buf_len; +} + +/* return -1 if error */ +ssize_t convert_from_chars(uint8_t **pout_buf, const char *str) +{ + const char *str_end; + int c, i; + uint8_t *out_buf; + size_t str_len, len; + + str_len = 
strlen(str); + str_end = str + str_len; + /* Note: the exact length of out_buf is smaller */ + out_buf = malloc(str_len); + memset(out_buf, 0, str_len); + + len = 0; + while (*str != '\0') { + c = unicode_from_utf8((uint8_t *)str, str_end - str, (const uint8_t **)&str); + if (c < 0) + goto fail; + c = unicode_to_c15(c); + if (c < 0 || c >= 32768) + goto fail; + for(i = 0; i < 15; i++) { + simple_put_bit(out_buf, len * 15 + i, (c >> (14 - i)) & 1); + } + len++; + } + *pout_buf = out_buf; + return (len * 15 + 7) / 8; + fail: + free(out_buf); + return -1; +} + +#define LENGTH_K 2 + +int encode_length(PutBitState *pb, uint32_t val) +{ + uint32_t n, a, b, i; + a = val; + n = 1; + for(;;) { + b = 1 << (LENGTH_K * n); + if (a < b) + break; + n++; + a -= b; + } + for(i = 0; i < n - 1; i++) + put_bit_raw(pb, 0); + put_bit_raw(pb, 1); + for(i = 0; i < (LENGTH_K * n); i++) { + put_bit_raw(pb, (a >> (LENGTH_K * n - 1 - i)) & 1); + } + return n + LENGTH_K * n; +} + +int decode_length(GetBitState *gb) +{ + int n, val, a, i; + n = 1; + a = 0; + for(;;) { + if (get_bit_raw(gb)) + break; + if (n >= 10) /* arbitrary limit */ + return -1; + a += 1 << (LENGTH_K * n); + n++; + } + val = 0; + for(i = 0; i < (LENGTH_K * n); i++) { + val |= get_bit_raw(gb) << (LENGTH_K * n - 1 - i); + } + return val + a; +} + +static void realloc_buf(char **pbuf, + size_t *psize, size_t len) +{ + size_t size = *psize; + if (len > size) { + size = max_size_t(len, size * 3 / 2); + *pbuf = realloc(*pbuf, sizeof(**pbuf) * size); + *psize = size; + } +} + + +#define CTEXT_LEN_MAX 256 + +int text_decompress(TextCompleteGlobalState *tcs, + char **poutput_text, const char *input_text) +{ + TransformerModel *s = tcs->trf_state; + WordList *wl = tcs->wl; + uint8_t *data_buf; + ssize_t data_buf_len, text_len, mem_len; + GetBitState gb_s, *gb = &gb_s; + BatchEntry tab_mem[1]; + NCTensor **mem_k, **mem_v; + DataSymbol *text_buf; + NCTensorData xbuf, *x; + int c, i; + char *out_str; + size_t out_str_len, out_str_size; 
+ + *poutput_text = NULL; + + /* XXX: handle zero length ? */ + data_buf_len = convert_from_chars(&data_buf, input_text); + if (data_buf_len < 0) + return -1; + if (data_buf_len == 0) { + *poutput_text = strdup(""); + free(data_buf); + return 0; + } +#if 0 + { + int i; + printf("data_buf="); + for(i = 0; i < data_buf_len; i++) + printf(" %02x", data_buf[i]); + printf("\n"); + } +#endif + get_bit_init(gb, data_buf, data_buf_len, NULL, NULL); + + text_len = decode_length(gb); + if (text_len < 0 || text_len > CTEXT_LEN_MAX) { + free(data_buf); + return -1; + } + text_len++; + + text_buf = nc_malloc(sizeof(text_buf[0]) * text_len); + + mem_k = nc_mallocz(sizeof(mem_k[0]) * s->n_layer); + mem_v = nc_mallocz(sizeof(mem_v[0]) * s->n_layer); + mem_len = text_len; + for(i = 0; i < s->n_layer; i++) { + mem_k[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + s->d_key, mem_len, s->n_head); + nc_tensor_set_name(mem_k[i], "mem_k_%d", i); + mem_v[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + s->d_value, mem_len, s->n_head); + nc_tensor_set_name(mem_v[i], "mem_v_%d", i); + } + tab_mem[0].mem_k = mem_k; + tab_mem[0].mem_v = mem_v; + + text_buf[0] = SYMB_EOT; + + for(i = 0; i < text_len - 1; i++) { + NCTensor *t0, *input; + int32_t *ptr; + + input = nc_new_tensor_1d(s->device, NC_TYPE_I32, 1); + ptr = nc_tensor_get_ptr(input, NULL); + ptr[0] = text_buf[i]; + tab_mem[0].mem_len = i; + t0 = trf_eval(s, 1, 1, tab_mem, input); + + t0 = nc_soft_max(t0); + x = nc_tensor_get_data(&xbuf, t0); + c = read_sym(gb, (float *)x->data, x->dims[0]); + text_buf[i + 1] = c; + nc_free_tensor(t0); + } + + /* convert back to a string */ + out_str = NULL; + out_str_len = 0; + out_str_size = 0; + for(i = 1; i < text_len; i++) { + Word *wp; + wp = &wl->words[text_buf[i]]; + realloc_buf(&out_str, &out_str_size, out_str_len + wp->len); + memcpy(out_str + out_str_len, wp->buf, wp->len); + out_str_len += wp->len; + } + realloc_buf(&out_str, &out_str_size, out_str_len + 1); + out_str[out_str_len] = '\0'; + + 
for(i = 0; i < s->n_layer; i++) { + nc_free_tensor(mem_k[i]); + nc_free_tensor(mem_v[i]); + } + nc_free(mem_k); + nc_free(mem_v); + nc_free(text_buf); + free(data_buf); + + *poutput_text = out_str; + + return 0; +} + +#define TEXT_OUTPUT_BUF_LEN 4096 + +static void text_arith_write_buf(void *opaque, const uint8_t *buf, size_t buf_size) +{ + /* we assume the output is small enough to fit the buffer */ +} + +int text_compress(TextCompleteGlobalState *tcs, + char **poutput_text, + const char *input_text, BOOL dump_stats) +{ + TransformerModel *s = tcs->trf_state; + DataSymbol *input_buf; + int i, mem_len; + NCTensorData xbuf, *x; + double n_bits; + BatchEntry tab_mem[1]; + NCTensor **mem_k, **mem_v, *output, *input; + PutBitState pb_s, *pb = &pb_s; + size_t input_buf_len, input_buf_size, out_buf_len; + uint8_t *out_buf; + char *out_str; + int32_t *ptr; + + *poutput_text = NULL; + + input_buf = NULL; + input_buf_size = 0; + input_buf_len = 0; + + add_char(&input_buf, &input_buf_size, &input_buf_len, SYMB_EOT); + gpt2_pp_encode_buf1(tcs->wl, &input_buf, &input_buf_size, &input_buf_len, + (const uint8_t *)input_text, + strlen(input_text)); + if (input_buf_len > CTEXT_LEN_MAX) { + free(input_buf); + return -1; + } + if (input_buf_len == 1) { + free(input_buf); + *poutput_text = strdup(""); + return 0; + } + +#if 0 + for(i = 0; i < input_buf_len; i++) { + printf(" %04x", input_buf[i]); + } + printf("\n"); +#endif + prof_start(PROF_EVAL); + input = nc_new_tensor_1d(s->device, NC_TYPE_I32, input_buf_len); + ptr = nc_tensor_get_ptr(input, NULL); + for(i = 0; i < input_buf_len; i++) { + ptr[i] = input_buf[i]; + } + + mem_k = nc_mallocz(sizeof(mem_k[0]) * s->n_layer); + mem_v = nc_mallocz(sizeof(mem_v[0]) * s->n_layer); + mem_len = input_buf_len; + for(i = 0; i < s->n_layer; i++) { + mem_k[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + s->d_key, mem_len, s->n_head); + nc_tensor_set_name(mem_k[i], "mem_k_%d", i); + mem_v[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + 
s->d_value, mem_len, s->n_head); + nc_tensor_set_name(mem_v[i], "mem_v_%d", i); + } + tab_mem[0].mem_len = 0; + tab_mem[0].mem_k = mem_k; + tab_mem[0].mem_v = mem_v; + + output = trf_eval(s, input_buf_len, 1, tab_mem, input); + prof_end(PROF_EVAL); + + out_buf = malloc(TEXT_OUTPUT_BUF_LEN); + put_bit_init(pb, out_buf, TEXT_OUTPUT_BUF_LEN, text_arith_write_buf, NULL); + + n_bits = encode_length(pb, input_buf_len - 1); + + for(i = 0; i < input_buf_len - 1; i++) { + double v; + NCTensor *t0; + t0 = nc_soft_max(nc_slice_alias(output, 1, i, i + 1)); + x = nc_tensor_get_data(&xbuf, t0); + write_sym(pb, (float *)x->data, x->dims[0], input_buf[i + 1]); + v = -log2(((float *)x->data)[input_buf[i + 1]]); + // printf("%d: %0.1f\n", i, v); + nc_free_tensor(t0); + n_bits += v; + } + nc_free_tensor(output); + out_buf_len = put_bit_flush(pb); +#if 0 + { + printf("out_buf="); + for(i = 0; i < (out_buf_len + 7) / 8; i++) + printf(" %02x", out_buf[i]); + printf("\n"); + } +#endif + convert_to_chars(&out_str, out_buf, out_buf_len); + if (dump_stats) { + printf("%d chars, %" PRId64 " symbols, %" PRId64 " bits (ref=%0.1f bits) (%d compressed chars)\n", + (int)strlen(input_text), + (int64_t)input_buf_len, + (int64_t)out_buf_len, + n_bits, + (int)((out_buf_len + 14) / 15)); + } + + free(out_buf); + free(input_buf); + for(i = 0; i < s->n_layer; i++) { + nc_free_tensor(mem_k[i]); + nc_free_tensor(mem_v[i]); + } + nc_free(mem_k); + nc_free(mem_v); + *poutput_text = out_str; + return 0; +} + +void text_compress_test(GPT2ModelEnum model, const char *model_filename, + const char *input_text, + BOOL is_decode, BOOL verbose) +{ + TextCompleteGlobalState *tcs; + char *out_str; + + tcs = text_complete_global_init(model, model_filename); + + if (is_decode) { + if (text_decompress(tcs, &out_str, input_text) < 0) { + printf("Error\n"); + } else { + printf("%s\n", out_str); + } + free(out_str); + } else { + if (text_compress(tcs, &out_str, input_text, verbose) < 0) { + printf("Error\n"); + } else { + 
printf("%s\n", out_str); + } + free(out_str); + } + text_complete_global_end(tcs); +} + +/*************************************************/ +/* file compression */ + +static uint8_t *load_file(size_t *psize, const char *filename) +{ + FILE *f; + size_t size; + uint8_t *buf; + + f = fopen(filename, "rb"); + if (!f) { + perror(filename); + exit(1); + } + fseek(f, 0, SEEK_END); + size = ftell(f); + fseek(f, 0, SEEK_SET); + buf = malloc(size + 1); + if (fread(buf, 1, size, f) != size) { + fprintf(stderr, "%s: I/O error\n", filename); + exit(1); + } + buf[size] = '\0'; + fclose(f); + *psize = size; + return buf; +} + +/* check if CRLF can be converted to LF losslessly */ +static BOOL check_lossless_crlf(const uint8_t *buf, size_t len) +{ + size_t i; + BOOL has_crlf; + has_crlf = FALSE; + for(i = 0; i < len - 1;) { + if (buf[i] == '\r' && buf[i + 1] == '\n') { + has_crlf = TRUE; + i += 2; + } else if (buf[i] == '\n') { + return FALSE; + } else { + i++; + } + } + return has_crlf; +} + +static size_t convert_crlf_to_lf(uint8_t *buf, size_t len) +{ + size_t i, j; + j = 0; + for(i = 0; i < len - 1;) { + if (buf[i] == '\r' && buf[i + 1] == '\n') + i++; + buf[j++] = buf[i++]; + } + if (i < len) + buf[j++] = buf[i++]; + return j; +} + +#define ARITH_BUF_LEN 65536 + +static void arith_write_buf(void *opaque, const uint8_t *buf, size_t buf_size) +{ + FILE *f = opaque; + fwrite(buf, 1, buf_size, f); +} + +/* XXX: should use a large batch size */ +int file_compress(TextCompleteGlobalState *tcs, + const char *infilename, const char *outfilename) +{ + TransformerModel *s = tcs->trf_state; + DataSymbol *input_buf; + int i, mem_len, len; + NCTensorData xbuf, *x; + BatchEntry tab_mem[1]; + NCTensor **mem_k, **mem_v, *output, *input; + PutBitState pb_s, *pb = &pb_s; + size_t input_buf_len, input_buf_size, input_text_len; + int64_t n_output_bits; + size_t input_buf_pos; + uint8_t *input_text, *arith_buf; + FILE *f; + BOOL convert_crlf; + int32_t *ptr; + + input_text = 
load_file(&input_text_len, infilename); + + convert_crlf = check_lossless_crlf(input_text, input_text_len); + // printf("convert_crlf=%d\n", convert_crlf); + + if (convert_crlf) { + input_text_len = convert_crlf_to_lf(input_text, input_text_len); + } + + input_buf = NULL; + input_buf_size = 0; + input_buf_len = 0; + + add_char(&input_buf, &input_buf_size, &input_buf_len, SYMB_EOT); + gpt2_pp_encode_buf1(tcs->wl, &input_buf, &input_buf_size, &input_buf_len, + input_text, input_text_len); + add_char(&input_buf, &input_buf_size, &input_buf_len, SYMB_EOT); + +#if 0 + for(i = 0; i < input_buf_len; i++) { + printf(" %04x", input_buf[i]); + } + printf("\n"); +#endif + prof_start(PROF_EVAL); + mem_k = nc_mallocz(sizeof(mem_k[0]) * s->n_layer); + mem_v = nc_mallocz(sizeof(mem_v[0]) * s->n_layer); + for(i = 0; i < s->n_layer; i++) { + mem_k[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + s->d_key, s->n_ctx, s->n_head); + nc_tensor_set_name(mem_k[i], "mem_k_%d", i); + mem_v[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + s->d_value, s->n_ctx, s->n_head); + nc_tensor_set_name(mem_v[i], "mem_v_%d", i); + } + + f = fopen(outfilename, "wb"); + if (!f) { + perror(outfilename); + exit(1); + } + + arith_buf = nc_malloc(ARITH_BUF_LEN); + put_bit_init(pb, arith_buf, ARITH_BUF_LEN, arith_write_buf, f); + + put_bit_raw(pb, convert_crlf); + + mem_len = 0; + input_buf_pos = 0; + while (input_buf_pos < (input_buf_len - 1)) { + len = min_size_t(input_buf_len - 1 - input_buf_pos, s->n_ctx - mem_len); + printf("%5.1f%% \r", (double)input_buf_pos / (double)input_buf_len * 100); + fflush(stdout); + // printf("pos=%d mem_len=%d len=%d\n", (int)input_buf_pos, mem_len, len); + + input = nc_new_tensor_1d(s->device, NC_TYPE_I32, mem_len + len); + ptr = nc_tensor_get_ptr(input, NULL); + for(i = 0; i < mem_len + len; i++) { + ptr[i] = input_buf[input_buf_pos - mem_len + i]; + } + tab_mem[0].mem_len = 0; + tab_mem[0].mem_k = mem_k; + tab_mem[0].mem_v = mem_v; + + output = trf_eval(s, mem_len + len, 1, 
tab_mem, input); + + for(i = 0; i < len; i++) { + NCTensor *t0; + t0 = nc_soft_max(nc_slice_alias(output, 1, mem_len + i, + mem_len + i + 1)); + x = nc_tensor_get_data(&xbuf, t0); + write_sym(pb, (float *)x->data, + x->dims[0], input_buf[input_buf_pos + i + 1]); + nc_free_tensor(t0); + } + nc_free_tensor(output); + + input_buf_pos += len; + mem_len = min_int(mem_len + len, s->n_ctx / 2); + } + + prof_end(PROF_EVAL); + + n_output_bits = put_bit_flush(pb); + + printf("-> %" PRId64 " bytes\n", (n_output_bits + 7) / 8); + fclose(f); + nc_free(arith_buf); + + free(input_buf); + for(i = 0; i < s->n_layer; i++) { + nc_free_tensor(mem_k[i]); + nc_free_tensor(mem_v[i]); + } + nc_free(mem_k); + nc_free(mem_v); + return 0; +} + +int file_decompress(TextCompleteGlobalState *tcs, + const char *infilename, const char *outfilename) +{ + TransformerModel *s = tcs->trf_state; + WordList *wl = tcs->wl; + uint8_t *data_buf; + ssize_t data_buf_len; + GetBitState gb_s, *gb = &gb_s; + BatchEntry tab_mem[1]; + NCTensor **mem_k, **mem_v, *input, *t0; + DataSymbol *text_buf; + NCTensorData xbuf, *x; + Word *wp; + int c, i, pos; + FILE *f; + BOOL convert_crlf; + int32_t *ptr; + + data_buf = load_file((size_t *)&data_buf_len, infilename); +#if 0 + { + int i; + printf("data_buf="); + for(i = 0; i < data_buf_len; i++) + printf(" %02x", data_buf[i]); + printf("\n"); + } +#endif + get_bit_init(gb, data_buf, data_buf_len, NULL, NULL); + + convert_crlf = get_bit_raw(gb); + + text_buf = nc_malloc(sizeof(text_buf[0]) * s->n_ctx); + + mem_k = nc_mallocz(sizeof(mem_k[0]) * s->n_layer); + mem_v = nc_mallocz(sizeof(mem_v[0]) * s->n_layer); + for(i = 0; i < s->n_layer; i++) { + mem_k[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + s->d_key, s->n_ctx, s->n_head); + nc_tensor_set_name(mem_k[i], "mem_k_%d", i); + mem_v[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + s->d_value, s->n_ctx, s->n_head); + nc_tensor_set_name(mem_v[i], "mem_v_%d", i); + } + tab_mem[0].mem_k = mem_k; + tab_mem[0].mem_v = mem_v; + 
+ text_buf[0] = SYMB_EOT; + + f = fopen(outfilename, "wb"); + if (!f) + perror(outfilename); + + pos = 0; + for(;;) { + input = nc_new_tensor_1d(s->device, NC_TYPE_I32, 1); + ptr = nc_tensor_get_ptr(input, NULL); + ptr[0] = text_buf[pos]; + tab_mem[0].mem_len = pos; + t0 = trf_eval(s, 1, 1, tab_mem, input); + t0 = nc_soft_max(t0); + x = nc_tensor_get_data(&xbuf, t0); + c = read_sym(gb, (float *)x->data, x->dims[0]); + nc_free_tensor(t0); + if (c == SYMB_EOT) + break; + wp = &wl->words[c]; + if (convert_crlf) { + for(i = 0; i < wp->len; i++) { + if (wp->buf[i] == '\n') + fputc('\r', f); + fputc(wp->buf[i], f); + } + } else { + fwrite(wp->buf, 1, wp->len, f); + } + fflush(f); + pos++; + if (pos >= s->n_ctx) { + int n; + /* buffer full: restart with the last n_ctx / 2 symbols */ + n = s->n_ctx / 2; + for(i = 0; i < n; i++) + text_buf[i] = text_buf[pos - n + i]; + + input = nc_new_tensor_1d(s->device, NC_TYPE_I32, n); + ptr = nc_tensor_get_ptr(input, NULL); + for(i = 0; i < n; i++) + ptr[i] = text_buf[i]; + tab_mem[0].mem_len = 0; + t0 = trf_eval(s, n, 1, tab_mem, input); + nc_free_tensor(t0); + pos = n; + } + text_buf[pos] = c; + } + + fclose(f); + + for(i = 0; i < s->n_layer; i++) { + nc_free_tensor(mem_k[i]); + nc_free_tensor(mem_v[i]); + } + nc_free(mem_k); + nc_free(mem_v); + nc_free(text_buf); + free(data_buf); + + return 0; +} diff --git a/gpt2/gpt2tc.h b/gpt2/gpt2tc.h new file mode 100644 index 0000000..a110569 --- /dev/null +++ b/gpt2/gpt2tc.h @@ -0,0 +1,143 @@ +#ifndef _GPT2TC_H +#define _GPT2TC_H +#ifdef __cplusplus +extern "C" { +#endif +#include +#include +#include +#include + +#include "cutils.h" +#include "arith.h" +#include "cp_utils.h" +#include "list.h" +#include "libnc.h" + +#define MAX_INITIAL_TEXT_LEN 256 /* in symbols */ +#define MAX_OUTPUT_LEN 100 +#define DEFAULT_TOP_K 40 +#define DEFAULT_TOP_P 0.9 +#define BATCH_SIZE_MAX 16 +//#define BATCH_SIZE_MAX 1 + + +typedef uint16_t DataSymbol; + +typedef enum { + GPT2_MODEL_117M, + GPT2_MODEL_345M, + 
GPT2_MODEL_774M, + GPT2_MODEL_1558M, +} GPT2ModelEnum; + +typedef struct { + BOOL is_decoder; + int n_layer; + int d_model; + int n_head; + int d_key; + int d_value; + int d_inner; + int n_ctx; + int n_symbols; + uint32_t seed; +} TransformerModelParams; + +typedef struct { + NCTensor *ln_1_g, *ln_1_b; + NCTensor *attn_w, *attn_b; + NCTensor *attn_proj_w, *attn_proj_b; + + NCTensor *ln_2_g, *ln_2_b; + NCTensor *mlp_fc_w, *mlp_fc_b; + NCTensor *mlp_proj_w, *mlp_proj_b; +} TransformerLayer; + +typedef struct { + RNDState rnd_state; + NCContext *model; + NCDevice *device; + int n_layer; + int d_model; + int n_head; + int d_key; + int d_value; + int d_inner; + int n_symbols; + int n_ctx; + + /* parameters */ + NCParamList param_list; + TransformerLayer *layers; + NCTensor *wte, *wpe, *wte_trans; + NCTensor *ln_f_g, *ln_f_b; +} TransformerModel; + +typedef struct Word { + uint32_t next; /* -1 = end */ + uint32_t len; + uint8_t *buf; +} Word; + +typedef struct { + Word *words; + size_t word_count; + size_t word_size; + uint32_t *hash_table; + int hash_size; + int hash_bits; +} WordList; + +typedef struct { + TransformerModel *trf_state; + WordList *wl; +} TextCompleteGlobalState; + +typedef struct { + struct list_head link; + TextCompleteGlobalState *global_state; + int top_k; + float top_p; + float temperature; + RNDState rnd_state; + NCTensor **mem_k, **mem_v; + DataSymbol *input_buf; + int input_buf_len; + int text_len; /* current input text len */ + BOOL is_first; + int last_c; + int max_output_len; + + /* output */ + char out_text[1024]; + int out_text_len; /* 0 means end of output */ +} TextGenContext; + +GPT2ModelEnum parse_model(const char *str); +void trf_set_params(TransformerModelParams *p, GPT2ModelEnum model); +void gpt2_pp_encode(const char *word_filename, const char *in_filename, const char *out_filename); +size_t gpt2_pp_encode_buf(WordList *s, DataSymbol **pout_buf, const uint8_t *buf, size_t buf_size); +void gpt2_pp_decode(const char *word_filename, 
const char *in_filename, const char *out_filename); +char *trim_text(const char *str); +TextCompleteGlobalState *text_complete_global_init(GPT2ModelEnum model, const char *filename); +void text_complete_global_end(TextCompleteGlobalState *tcs); +TextGenContext *text_complete_start(TextCompleteGlobalState *tcs, const char *input_text, int top_k, float top_p, float temperature, int seed, int max_output_len); +void text_complete_next(TextCompleteGlobalState *tcs, struct list_head *ts_list); +void text_complete_end(TextGenContext *ts); +void text_complete(GPT2ModelEnum model, const char *model_filename, const char *input_text, int top_k, float top_p, float temperature, int max_output_len, int batch_size, int seed, BOOL verbose); +int unicode_to_utf8(uint8_t *buf, unsigned int c); +int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp); +size_t convert_to_chars(char **pout_buf, uint8_t *buf, size_t n_bits); +ssize_t convert_from_chars(uint8_t **pout_buf, const char *str); +int encode_length(PutBitState *pb, uint32_t val); +int decode_length(GetBitState *gb); +int text_decompress(TextCompleteGlobalState *tcs, char **poutput_text, const char *input_text); +int text_compress(TextCompleteGlobalState *tcs, char **poutput_text, const char *input_text, BOOL dump_stats); +void text_compress_test(GPT2ModelEnum model, const char *model_filename, const char *input_text, BOOL is_decode, BOOL verbose); +int file_compress(TextCompleteGlobalState *tcs, const char *infilename, const char *outfilename); +int file_decompress(TextCompleteGlobalState *tcs, const char *infilename, const char *outfilename); +#ifdef __cplusplus +} +#endif +#endif diff --git a/gpt2/gpt2vocab.txt b/gpt2/gpt2vocab.txt new file mode 100644 index 0000000..62c74b2 Binary files /dev/null and b/gpt2/gpt2vocab.txt differ diff --git a/gpt2/libnc.h b/gpt2/libnc.h new file mode 100644 index 0000000..8a478c5 --- /dev/null +++ b/gpt2/libnc.h @@ -0,0 +1,426 @@ +/* + * LibNC + * + * Copyright (c) 2018-2019 
Fabrice Bellard + * + */ +#ifndef LIBNC_H +#define LIBNC_H + +#include +#include "cutils.h" +#include "list.h" + +/* profiling */ + +typedef enum { + PROF_EVAL, + PROF_GRAD, + PROF_SGD, + PROF_UPDATE, + PROF_WRITE_SYM, + PROF_PROBE, + PROF_TOTAL, + PROF_COUNT, +} ProfEnum; + +#ifdef PROFILE + +extern int64_t prof_cycles[PROF_COUNT]; +extern int64_t prof_samples[PROF_COUNT]; +extern int64_t prof_ops[PROF_COUNT]; + +static inline void prof_start(int idx) +{ + prof_cycles[idx] -= get_cycles(); +} + +static inline void prof_end(int idx) +{ + prof_cycles[idx] += get_cycles(); + prof_samples[idx]++; +} + +static inline void prof_end_ops(int idx, int n_ops) +{ + prof_cycles[idx] += get_cycles(); + prof_ops[idx] += n_ops; + prof_samples[idx]++; +} + +#else + +static inline void prof_start(int idx) +{ +} + +static inline void prof_end(int idx) +{ +} + +static inline void prof_end_ops(int idx, int n_ops) +{ +} + +#endif + +void nc_prof_dump(void); + +/* Automatic Differentiation Engine */ + +typedef struct NCContext NCContext; +typedef struct NCDevice NCDevice; +typedef struct NCTensor NCTensor; +typedef struct NCTensorBuffer NCTensorBuffer; +typedef struct NCNode NCNode; +typedef struct NCRNDState NCRNDState; +typedef struct NCSGDOptState NCSGDOptState; + +typedef enum { + NC_TYPE_F32, + NC_TYPE_BF16, + NC_TYPE_F16, + NC_TYPE_I8, + NC_TYPE_I16, + NC_TYPE_I32, + NC_TYPE_COUNT, +} NCTypeEnum; + +extern size_t nc_type_size_table[NC_TYPE_COUNT]; +extern const char *nc_type_name_table[NC_TYPE_COUNT]; + +#define NC_N_DIMS_MAX 4 /* maximum number of axis for tensors */ + +typedef struct NCTensorData { + NCTypeEnum item_type; + size_t item_size; + void *data; + size_t stride; /* in elements */ + size_t n_strides; /* prod(j = 1 ... 
n_dims, dims[j]); */ + int n_dims; + const size_t *dims; /* n_dims length */ + const size_t *strides; /* n_dims length, strides in bytes */ +} NCTensorData; + +void *nc_malloc(size_t size); +void *nc_mallocz(size_t size); +void nc_free(void *ptr); + +NCContext *nc_context_init(int nb_threads); +void nc_context_end(NCContext *m); + +NCDevice *nc_new_cpu_device(NCContext *m); +NCDevice *nc_new_cuda_device(NCContext *m, int device_index); +NCDevice *nc_new_device(NCContext *m, const char *device_name); +void nc_synchronize(NCDevice *d); + +NCTensorBuffer *nc_new_tensor_buffer(NCDevice *d, size_t size); +NCTensorBuffer *nc_dup_tensor_buffer(const NCTensorBuffer *b); +void nc_free_tensor_buffer(NCTensorBuffer *b); + +NCTensor *nc_new_tensor(NCDevice *d, NCTypeEnum type, + int n_dims, const size_t *dims); +NCTensor *nc_new_tensor_from_tensor(const NCTensor *x); +NCTensor *nc_new_tensor_from_tensor_nz(const NCTensor *x); +NCTensor *nc_new_scalar(NCDevice *d, NCTypeEnum type); +NCTensor *nc_new_tensor_1d(NCDevice *d, NCTypeEnum type, size_t len); +NCTensor *nc_new_tensor_2d(NCDevice *d, NCTypeEnum type, size_t n0, size_t n1); +NCTensor *nc_new_tensor_3d(NCDevice *d, NCTypeEnum type, + size_t n0, size_t n1, size_t n2); +NCTensor *nc_new_tensor_4d(NCDevice *d, NCTypeEnum type, + size_t n0, size_t n1, size_t n2, size_t n3); +NCTensor *__attribute__((format(printf, 2, 3))) nc_tensor_set_name(NCTensor *x, const char *fmt, ...); +NCTensor *nc_dup_tensor(const NCTensor *x); +void nc_free_tensor(NCTensor *x); +void nc_dump_tensor(const char *name, NCTensor *x, size_t n); +uint32_t nc_tensor_get_hash(NCTensor *x); +void nc_dump_tensor_hash(const char *name, const NCTensor *x); +NCNode *nc_get_node(NCTensor *x); +/* create an alias to tensor 'x1'. 
Gradient is not propagated thru it */ +NCTensor *nc_slice_alias(const NCTensor *x1, int axis, size_t start, size_t end); + +NCTypeEnum nc_tensor_get_item_type(const NCTensor *x); +NCTensorData *nc_tensor_get_data(NCTensorData *sd, const NCTensor *x); +/* Return a pointer to the tensor data. If *pstride is non NULL, + return the stride (in elements) of the first dimension. */ +void *nc_tensor_get_ptr(NCTensor *x, size_t *pstride); +const size_t *nc_tensor_get_dims(const NCTensor *x, int *pn_dims); +void nc_tensor_set_zero(NCTensor *y); +void nc_tensor_set_f32(NCTensor *y, float val); +NCRNDState *nc_rnd_init(NCDevice *d, uint32_t seed); +void nc_rnd_end(NCRNDState *s); +void nc_tensor_set_rnd_unif(NCTensor *y, float avg, float range, + NCRNDState *rnd_state); +void nc_tensor_set_dropout(NCTensor *y, float prob, NCRNDState *rnd_state); + +void nc_set1_i32(NCTensor *y, int n_dims, const size_t *tab_indexes, + int32_t val); +void nc_set1_i32_1d(NCTensor *y, size_t i0, int32_t val); +void nc_set1_i32_2d(NCTensor *y, size_t i0, size_t i1, int32_t val); +void nc_set1_f32(NCTensor *y, int n_dims, const size_t *tab_indexes, + float val); +void nc_set1_f32_1d(NCTensor *y, size_t i0, float val); + +int32_t nc_get1_i32(const NCTensor *x, int n_dims, const size_t *tab_indexes); +float nc_get1_f32(const NCTensor *x, int n_dims, const size_t *tab_indexes); +float nc_get1_f32_1d(const NCTensor *x, size_t i0); +float nc_get_scalar_f32(const NCTensor *x); + +void nc_tensor_copy(NCTensor *dst, NCTensor *src); +void nc_tensor_convert(NCTensor *dst, NCTensor *src); + +void nc_dump_dims(const char *str, NCTensor *x); +size_t nc_get_heap_size(NCContext *m); +NCContext *nc_get_tensor_context(const NCTensor *x); +NCTensor *nc_tensor_to_device(NCTensor *x, NCDevice *d); +NCTensor *nc_tensor_to_cpu_device(NCTensor *x); +NCDevice *nc_get_tensor_device(const NCTensor *x); + +/* element wise operations */ +NCTensor *nc_convert(NCTensor *x, NCTypeEnum new_type); +NCTensor *nc_add(NCTensor *x1, 
NCTensor *x2); +NCTensor *nc_neg(NCTensor *x); +NCTensor *nc_sub(NCTensor *x1, NCTensor *x2); +NCTensor *nc_mul(NCTensor *x1, NCTensor *x2); +NCTensor *nc_div(NCTensor *x1, NCTensor *x2); +NCTensor *nc_recip(NCTensor *x); +NCTensor *nc_min(NCTensor *x1, NCTensor *x2); +NCTensor *nc_max(NCTensor *x1, NCTensor *x2); +/* select x1[i] if z[i] = 0 and x2[i] otherwise */ +NCTensor *nc_select(NCTensor *z, NCTensor *x1, NCTensor *x2); +/* set y[i] = x1[i] if mask[i] = 0 and y[i] = c if mask[i] != 0. If + mask_inv is TRUE, 'mask' is inverted */ +NCTensor *nc_masked_fill(NCTensor *x, NCTensor *mask, float c, BOOL mask_inv); +NCTensor *nc_sigmoid(NCTensor *x); +NCTensor *nc_tanh(NCTensor *x); +NCTensor *nc_relu(NCTensor *x); +NCTensor *nc_gelu(NCTensor *x); +NCTensor *nc_log(NCTensor *x); +/* return cp * fg + min(1 - fg, ig) * in */ +NCTensor *nc_lstm_clamped(NCTensor *cp, NCTensor *in, + NCTensor *fg, NCTensor *ig); +/* return a * (1 - t) + b * t */ +NCTensor *nc_lerp(NCTensor *a, NCTensor *b, NCTensor *t); + +/* other operations */ +NCTensor *nc_new_vec_f32(NCDevice *d, size_t n, float val); +NCTensor *nc_new_f32(NCDevice *d, float val); +NCTensor *nc_reshape(NCTensor *x, int n_dims, const size_t *dims); +NCTensor *nc_reshape_1d(NCTensor *x, size_t n0); +NCTensor *nc_reshape_2d(NCTensor *x, size_t n0, size_t n1); +NCTensor *nc_reshape_3d(NCTensor *x, size_t n0, size_t n1, size_t n2); +NCTensor *nc_reshape_4d(NCTensor *x, size_t n0, size_t n1, size_t n2, + size_t n3); +/* duplicate the tensor by adding n_dims dimensions */ +NCTensor *nc_repeat(NCTensor *x, int n_dims, const size_t *dims); +NCTensor *nc_repeat_1d(NCTensor *x, size_t n); +/* return y0 + sum over the dimensions > n_dims of 'x'. 
y0 = NULL + is supported */ +NCTensor *nc_reduce_sum(NCTensor *y0, NCTensor *x, int n_dims); +/* sum all the elements of a tensor */ +NCTensor *nc_sum(NCTensor *x); +/* sum of squares */ +NCTensor *nc_reduce_sum_sqr(NCTensor *x); +NCTensor *nc_slice(NCTensor *x, int axis, size_t start, size_t end); +NCTensor *nc_slice_add(NCTensor *y0, NCTensor *x, int axis, size_t start); +/* concatenation along axis 'axis' */ +NCTensor *nc_concat(NCTensor **inputs, int n_inputs, int axis); +/* shortcut for axis = 0 */ +NCTensor *nc_vconcat(NCTensor **inputs, int n_inputs); +/* shortcut for axis = 1 */ +NCTensor *nc_hconcat(NCTensor **inputs, int n_inputs); +/* split along axis 'axis'. If tab_size = NULL, split equally. */ +void nc_split(NCTensor **tab_y, NCTensor *x, int n_outputs, + const size_t *tab_size, int axis); +/* shortcut for axis = 0 */ +void nc_vsplit(NCTensor **tab_y, NCTensor *x, int n_outputs, + const size_t *tab_size); +/* shortcut for axis = 1 */ +void nc_hsplit(NCTensor **tab_y, NCTensor *x, int n_outputs, + const size_t *tab_size); + +typedef enum { + NC_PAD_ZERO, + NC_PAD_DUP, /* duplicate element */ + /* trim types, dual to padding */ + NC_TRIM_NORMAL = NC_PAD_ZERO, + NC_TRIM_SUM, /* add trimmed elements to the edge */ +} NCPadEnum; + +/* pad (len > 0) or trim (len < 0) the axis 0 of 'x' */ +NCTensor *nc_pad(NCTensor *x, ssize_t left_len, NCPadEnum left_op, + ssize_t right_len, NCPadEnum right_op); +/* shortcut to nc_pad() */ +NCTensor *nc_resize(NCTensor *x, size_t n); + +/* if x is not contiguous then create a new contiguous tensor and copy + x to it. Otherwise, return 'x'. */ +NCTensor *nc_make_contiguous(NCTensor *x); +/* Return a new tensor sharing the same buffer as 'x' with the permuted + dimensions. axis[i] is the corresponding axis in 'x' */ +NCTensor *nc_permute_alias(NCTensor *x, int n_dims, const int *axis); +/* same as nc_permute_alias but calls nc_make_contiguous after. 
*/ +NCTensor *nc_permute(NCTensor *x, int n_dims, const int *axis); +/* special case of nc_permute() */ +NCTensor *nc_transpose(NCTensor *x); +NCTensor *nc_matmul(NCTensor *w, NCTensor *x); +/* return w*x + y0. w and x can be optionally transposed. y0 can be NULL */ +NCTensor *nc_matmul_add(NCTensor *w, NCTensor *x, NCTensor *y0, + BOOL w_trans, BOOL x_trans); +NCTensor *nc_matmul_stride(NCTensor *w, NCTensor *x); +/* return a matrix where each column is the column x[i] of matrix 'w' */ +NCTensor *nc_get_col(NCTensor *w, NCTensor *x); +/* add the vectors 'z' at column number 'x' in matrix 'w'. */ +NCTensor *nc_add_col(NCTensor *z, NCTensor *x, NCTensor *w); +/* select the x-th element in each column of 'w' */ +NCTensor *nc_get_element(NCTensor *w, NCTensor *x); +/* add z to the x-th element in each column of 'w' */ +NCTensor *nc_add_element(NCTensor *z, NCTensor *x, NCTensor *w); +NCTensor *nc_soft_max(NCTensor *x); +/* Equivalent to y = log(get_element(x, eout)). It is expected to be + used as nc_index_log(nc_soft_max(x), eout) so that the gradient + computation is optimized. */ +NCTensor *nc_indexed_log(NCTensor *x, NCTensor *eout); +NCTensor *nc_layer_norm(NCTensor *x, float eps); +NCTensor *nc_rms_norm(NCTensor *x, float eps); +NCTensor *nc_slt_mat_set(NCTensor *x, size_t pos, float c); +/* shift the column 'i' by 'pos + i * mult' elements and pad with with zeros */ +NCTensor *nc_rel_shift(NCTensor *x, ssize_t pos, ssize_t mult); + +/* auto differentiation */ + +/* get_col_index is non NULL in the sparse gradient case */ +typedef void NCParamUpdateFunc(void *opaque, NCTensor *grad, + NCTensor *get_col_index); + +/* add a 'parameter' graph node to 'x' and return 'x'. 
*/ +NCTensor *nc_set_param(NCTensor *x, void *opaque); +/* return a new tensor with its graph removed */ +NCTensor *nc_stop_grad(NCTensor *x); + +/* manipulation of graph nodes */ +NCNode *nc_dup_node(const NCNode *n); +void nc_free_node(NCNode *n); +void nc_combine_nodes(NCContext *m, NCNode **tab_op1, int count, + int axis, int elem_size, const size_t *tab_elem_size); +NCNode *nc_concat_node(NCContext *m, NCNode **inputs, int count, + int axis, const size_t *tab_size); +void nc_concat_optimization(NCContext *m, NCNode **concat_nodes, int count); +void nc_node_set_parent(NCNode *n, int arg_index, const NCNode *n1); +void nc_node_set_arg(NCNode *n, int arg_index, const NCTensor *x); + +#define NC_BW_KEEP_GRAD_GRAPH (1 << 0) +/* optimize the nc_get_col() gradient */ +#define NC_BW_SPARSE_GRAD (1 << 1) + +void nc_backward(const NCTensor *x, NCTensor *grad, + NCParamUpdateFunc *param_update_func, int flags); +void nc_dump_graph(NCTensor *x); + +/* utilities for function parameters */ + +typedef struct { + struct list_head link; + NCTensor **pval; /* pointer to the tensor location */ + char *name; /* parameter name */ + NCTensor *low_part; /* if BF16 parameter, additional 16 bit precision */ + NCTensor *saved_grad; /* debug */ + /* SGD opt data */ + struct SGDOptVarState *sgd_opt; +} NCParam; + +typedef struct { + struct list_head param_list; + BOOL add_graph; +} NCParamList; + +void nc_param_list_init(NCParamList *pl); +void nc_param_list_set_graph(NCParamList *pl, BOOL add_graph); +NCParam *nc_new_param_str(NCParamList *pl, NCTensor **pval, const char *str); +__attribute__((format(printf, 3, 4))) NCParam *nc_new_param(NCParamList *pl, NCTensor **pval, const char *fmt, ...); +void nc_param_list_end(NCParamList *pl); + +NCParam *nc_find_param(NCParamList *pl, const char *name); +size_t nc_get_param_count(NCParamList *pl); +void nc_save_coefs(NCParamList *pl, const char *filename); +void nc_load_coefs(NCParamList *pl, const char *filename); +void 
nc_save_state(NCParamList *pl, const char *filename); +void nc_load_state(NCParamList *pl, const char *filename); + +/* SGD optimizer */ + +typedef enum { + SGD_OPT_BASIC, + SGD_OPT_ADAM, + SGD_OPT_TEST, +} SGDOptAlgoEnum; + +typedef struct { + SGDOptAlgoEnum algo; + union { + struct { + float beta1; + float beta2; + float eps; + float gradient_clip; /* if != 0, per parameter gradient clipping */ + } adam; + } u; + float lr; +} SGDOptParams; + +NCSGDOptState *nc_sgd_opt_init(NCContext *m, const SGDOptParams *p); +void nc_sgd_opt_end(NCSGDOptState *s); +void sgd_opt_update_var(void *opaque, NCTensor *yg, NCTensor *get_col_index); + +/* set the SGD optimizer 's' to all parameters of the model */ +void nc_sgd_opt_set_all(NCParamList *param_list, NCSGDOptState *s); + +/* set the SGD optimizer 's' to the variable 'x'. Remove it if s = NULL */ +void nc_sgd_opt_set(NCParam *x, NCSGDOptState *s); +void nc_sgd_opt_update(NCSGDOptState *s); +/* force the learning rate */ +void nc_sgd_opt_set_lr(NCSGDOptState *s, float lr); +float nc_sgd_opt_get_lr(NCSGDOptState *s); + +/* for SGD_OPT_TEST */ +NCTensor *nc_sgd_opt_get_grad(NCParam *p); + +/* misc utilities (to be removed) */ + +typedef struct { + uint32_t seed; + /* used by Gaussian generator */ + int idx; + float y1; +} RNDState; + +typedef struct { + uint16_t u16; +} nc_float16_t; + +void rnd_init(RNDState *s, uint32_t seed); +uint32_t rnd_unif_u32(RNDState *s); +float rnd_unif(RNDState *s); +void rnd_unif_vec(float *tab, size_t n, float mu, float range, + RNDState *s); +void rnd_unif_mat(float *tab, size_t stride, size_t h, size_t w, + float mu, float sigma, RNDState *s); + +float vec_sum_f32(const float *tab, size_t n); + +typedef struct { + float val; + uint32_t idx; +} NCTopKEntry; + +/* Return the k largest values among prob[0...n_symb-1] such that k is + the largest value such that k <= topk and sum(i=0 .. k - 2, + prob[tab[i]]) < topp. + + It is assumed that prob[i] >= 0. The function returns (k, tab, + sum). 
'sum' is the sum of the k returned values. 'tab' must be + freed with nc_free(). */ +int nc_topk(NCTopKEntry **ptab, double *psum, + const float *prob, size_t n, int topk, float topp); + +#endif /* LIBNC_H */ diff --git a/gpt2/list.h b/gpt2/list.h new file mode 100644 index 0000000..9ceddef --- /dev/null +++ b/gpt2/list.h @@ -0,0 +1,96 @@ +/* + * Linux klist like system + * + * Copyright (c) 2016-2017 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef LIST_H +#define LIST_H + +struct list_head { + struct list_head *prev; + struct list_head *next; +}; + +#define LIST_HEAD_INIT(el) { &(el), &(el) } + +/* return the pointer of type 'type *' containing 'el' as field 'member' */ +#define list_entry(el, type, member) \ + ((type *)((uint8_t *)(el) - offsetof(type, member))) + +static inline void init_list_head(struct list_head *head) +{ + head->prev = head; + head->next = head; +} + +/* insert 'el' between 'prev' and 'next' */ +static inline void __list_add(struct list_head *el, + struct list_head *prev, struct list_head *next) +{ + prev->next = el; + el->prev = prev; + el->next = next; + next->prev = el; +} + +/* add 'el' at the head of the list 'head' (= after element head) */ +static inline void list_add(struct list_head *el, struct list_head *head) +{ + __list_add(el, head, head->next); +} + +/* add 'el' at the end of the list 'head' (= before element head) */ +static inline void list_add_tail(struct list_head *el, struct list_head *head) +{ + __list_add(el, head->prev, head); +} + +static inline void list_del(struct list_head *el) +{ + struct list_head *prev, *next; + prev = el->prev; + next = el->next; + prev->next = next; + next->prev = prev; + el->prev = NULL; /* fail safe */ + el->next = NULL; /* fail safe */ +} + +static inline int list_empty(struct list_head *el) +{ + return el->next == el; +} + +#define list_for_each(el, head) \ + for(el = (head)->next; el != (head); el = el->next) + +#define list_for_each_safe(el, el1, head) \ + for(el = (head)->next, el1 = el->next; el != (head); \ + el = el1, el1 = el->next) + +#define list_for_each_prev(el, head) \ + for(el = (head)->prev; el != (head); el = el->prev) + +#define list_for_each_prev_safe(el, el1, head) \ + for(el = (head)->prev, el1 = el->prev; el != (head); \ + el = el1, el1 = el->prev) + +#endif /* LIST_H */ diff --git a/gpt2/readme.txt b/gpt2/readme.txt new file mode 100644 index 0000000..3f95757 --- /dev/null +++ b/gpt2/readme.txt @@ 
tar xf gpt2tc-117M.tar.gz
+- For reference, the results of CMIX + (http://www.byronknoll.com/cmix.html) are provided. + +4) More information +------------------- + +This demo has no external dependency. It is written in C and uses the +LibNC library for tensor manipulation. The CPU must support AVX2. + +A similar program is used for http://textsynth.org/ diff --git a/justlm.hpp b/justlm.hpp new file mode 100644 index 0000000..4347ca7 --- /dev/null +++ b/justlm.hpp @@ -0,0 +1,54 @@ +#ifndef LLM_H +#define LLM_H +#include +#include +#include +#include +#include +#include + + +class LLM { + struct { + int32_t seed; // RNG seed + int32_t n_threads = static_cast(std::thread::hardware_concurrency()) / 2; + union { + int32_t n_ctx; // Context size, llama.cpp specific + int32_t n_prompt = -1; // Prompt size, gpt2 specific + }; + int32_t n_batch = 8; // Batch size, unused + + int32_t top_k = 40; + float top_p = 0.5f; + float temp = 0.72f; + } params; + + struct State *state; + + void init(const std::string& weights_path); + + static + bool ends_with(std::string_view str, std::string_view suffix); + +public: + struct Exception : public std::runtime_error { + using std::runtime_error::runtime_error; + }; + struct ContextLengthException : public Exception { + ContextLengthException() : Exception("Max. 
context length exceeded") {} + }; + + LLM(const std::string& weights_path, int32_t seed = 0) { + // Set random seed + params.seed = seed?seed:time(NULL); + + // Initialize llm + init(weights_path); + } + ~LLM(); + + void append(std::string_view prompt, const std::function& on_tick = nullptr); + + std::string run(std::string_view end, const std::function& on_tick = nullptr); +}; +#endif // LLM_H diff --git a/libjustlm_core.cpp b/libjustlm_core.cpp new file mode 100644 index 0000000..493313c --- /dev/null +++ b/libjustlm_core.cpp @@ -0,0 +1,9 @@ +#include "justlm.hpp" + +#include + + + +bool LLM::ends_with(std::string_view str, std::string_view suffix) { + return str.size() >= suffix.size() && 0 == str.compare(str.size()-suffix.size(), suffix.size(), suffix); +} diff --git a/libjustlm_gpt2.cpp b/libjustlm_gpt2.cpp new file mode 100644 index 0000000..c29f42c --- /dev/null +++ b/libjustlm_gpt2.cpp @@ -0,0 +1,80 @@ +#include "justlm.hpp" +#include "gpt2/gpt2tc.h" + +#include +#include + + +struct State { + std::string prompt; + std::string model_path; + GPT2ModelEnum model; +} state; + + + +void LLM::init(const std::string& weights_path) { + state->model_path = weights_path; + // Get weight file size + auto weights_size = std::filesystem::file_size(weights_path); + // Determine weight size + switch (weights_size) { + case 250700242: state->model = GPT2_MODEL_117M; break; + case 3120522738: state->model = GPT2_MODEL_1558M; break; + case 712396722: state->model = GPT2_MODEL_345M; break; + case 1551900050: state->model = GPT2_MODEL_774M; break; + default: throw Exception("Unknown model size"); + } +} + +LLM::~LLM() { + delete state; +} + +void LLM::append(std::string_view prompt, const std::function &on_tick) { + state->prompt.append(prompt); + std::cout << prompt << std::endl; +} + +std::string LLM::run(std::string_view end, const std::function &on_tick) { + std::string fres; + TextCompleteGlobalState *tcs; + TextGenContext *ts; + int count; + struct timeval tv; + struct 
list_head ts_list; + + // Initialize completion + tcs = text_complete_global_init(state->model, state->model_path.c_str()); + + // Run completion + ts = text_complete_start(tcs, state->prompt.c_str(), params.top_k, params.top_p, params.temp, + params.seed, params.n_prompt>0?params.n_prompt:0xfffffff - state->prompt.size()); + bool abort = false; + while (!abort && !ends_with(fres, end)) { + // Run completion + init_list_head(&ts_list); + list_add_tail(&ts->link, &ts_list); + text_complete_next(tcs, &ts_list); + if (ts->out_text_len == 0) + break; + auto str = std::string_view{ts->out_text, static_cast(ts->out_text_len)}; + + // Append result to fres + fres.append(str); + + // Tick + if (on_tick && !on_tick(std::string(str).c_str()) /*Huge overhead in favor of llama.cpp*/) abort = true; + } + // End completion + text_complete_end(ts); + + text_complete_global_end(tcs); + + // Create final string TODO: Could be optimized + state->prompt.append(fres); + fres = std::string(fres.data(), fres.size()-end.size()); + + // Return final string + return fres; +} diff --git a/libjustlm_llama.cpp b/libjustlm_llama.cpp new file mode 100644 index 0000000..c3e2981 --- /dev/null +++ b/libjustlm_llama.cpp @@ -0,0 +1,115 @@ +#include "justlm.hpp" + +#include +#include + + +struct State { + llama_context *ctx = nullptr; + std::string prompt; + std::vector embd; + int n_ctx; + std::string last_result; +} state; + + + +void LLM::init(const std::string& weights_path) { + // Allocate state + state = new State; + + // Get llama parameters + auto lparams = llama_context_default_params(); + lparams.seed = params.seed; + lparams.n_ctx = params.n_ctx>0?params.n_ctx:2024; + + // Create context + state->ctx = llama_init_from_file(weights_path.c_str(), lparams); + if (!state->ctx) { + throw Exception("Failed to initialize llama from file"); + } + + // Initialize some variables + state->n_ctx = llama_n_ctx(state->ctx); +} + +LLM::~LLM() { + if (state->ctx) llama_free(state->ctx); + delete state; +} 
+ +void LLM::append(std::string_view prompt, const std::function &on_tick) { + // Check if prompt was empty + const bool was_empty = state->prompt.empty(); + + // Append to current prompt + state->prompt.append(prompt); + + // Resize buffer for tokens + const auto old_token_count = state->embd.size(); + state->embd.resize(old_token_count+state->prompt.size()+1); + + // Run tokenizer + const auto token_count = llama_tokenize(state->ctx, prompt.data(), state->embd.data()+old_token_count, state->embd.size()-old_token_count, was_empty); + state->embd.resize(old_token_count+token_count); + + // Make sure limit is far from being hit + if (state->embd.size() > state->n_ctx-6) { + // Yup. *this MUST be decomposed now. + throw ContextLengthException(); + } + + // Evaluate new tokens + // TODO: Larger batch size + std::cout << "Context size: " << old_token_count << '+' << token_count << '=' << state->embd.size() << '/' << state->n_ctx << std::endl; + for (int it = old_token_count; it != state->embd.size(); it++) { + std::cout << llama_token_to_str(state->ctx, state->embd.data()[it]) << std::flush; + llama_eval(state->ctx, state->embd.data()+it, 1, it, params.n_threads); + + // Tick + if (on_tick) { + // Calculate progress + auto progress = float(it-old_token_count) / (state->embd.size()-old_token_count) * 100.f; + // Run callback + if (!on_tick(progress)) break; + } + } + std::cout << std::endl; +} + +std::string LLM::run(std::string_view end, const std::function &on_tick) { + std::string fres; + + // Loop until done + bool abort = false; + while (!abort && !ends_with(fres, end)) { + // Sample top p and top k + const auto id = llama_sample_top_p_top_k(state->ctx, nullptr, 0, params.top_k, params.top_p, params.temp, 1.0f); + + // Add token + state->embd.push_back(id); + + // Get token as string + const auto str = llama_token_to_str(state->ctx, id); + + // Debug + std::cout << str << std::flush; + + // Append string to function result + fres.append(str); + + // Evaluate token 
+ // TODO: Respect batch size + llama_eval(state->ctx, state->embd.data()+state->embd.size()-1, 1, state->embd.size()-1, params.n_threads); + + // Tick + if (on_tick && !on_tick(str)) abort = true; + } + + // Create final string TODO: Could be optimized + state->prompt.append(fres); + fres = std::string(fres.data(), fres.size()-end.size()); + + // Return final string + return fres; +} diff --git a/llama.cpp b/llama.cpp new file mode 160000 index 0000000..9cbc404 --- /dev/null +++ b/llama.cpp @@ -0,0 +1 @@ +Subproject commit 9cbc404ba6699a9ba4925ea25a60552b13491c7a diff --git a/test.cpp b/test.cpp new file mode 100644 index 0000000..94f5868 --- /dev/null +++ b/test.cpp @@ -0,0 +1,12 @@ +#include "ai.hpp" + +#include + + + +int main() { + Ai ai; + std::cout << "Completing \"she replied that\"..." << std::endl; + std::cout << "Using model " << ai.model_name << "..." << std::endl; + std::cout << "> she replied that" << ai.complete("she replied that", '\n') << std::endl; +}