1
0
Fork 0
mirror of https://gitlab.com/niansa/libjustlm.git synced 2025-03-06 20:49:17 +01:00

Initial commit

This commit is contained in:
niansa/tuxifan 2023-03-30 07:03:33 -05:00
commit aaddcc0cbd
21 changed files with 4037 additions and 0 deletions

74
.gitignore vendored Normal file
View file

@ -0,0 +1,74 @@
# This file is used to ignore files which are generated
# ----------------------------------------------------------------------------
*~
*.autosave
*.a
*.core
*.moc
*.o
*.obj
*.orig
*.rej
*.so
*.so.*
*_pch.h.cpp
*_resource.rc
*.qm
.#*
*.*#
core
!core/
tags
.DS_Store
.directory
*.debug
Makefile*
*.prl
*.app
moc_*.cpp
ui_*.h
qrc_*.cpp
Thumbs.db
*.res
*.rc
/.qmake.cache
/.qmake.stash
# qtcreator generated files
*.pro.user*
CMakeLists.txt.user*
# xemacs temporary files
*.flc
# Vim temporary files
.*.swp
# Visual Studio generated files
*.ib_pdb_index
*.idb
*.ilk
*.pdb
*.sln
*.suo
*.vcproj
*vcproj.*.*.user
*.ncb
*.sdf
*.opensdf
*.vcxproj
*vcxproj.*
# MinGW generated files
*.Debug
*.Release
# Python byte code
*.pyc
# Binaries
# --------
*.dll
*.exe

3
.gitmodules vendored Normal file
View file

@ -0,0 +1,3 @@
[submodule "llama.cpp"]
path = llama.cpp
url = https://github.com/ggerganov/llama.cpp.git

24
CMakeLists.txt Normal file
View file

@ -0,0 +1,24 @@
# Build script for libjustlm: a static core library plus one backend
# library selected at configure time via the LM_BACKEND cache variable.
cmake_minimum_required(VERSION 3.14)
project(libjustlm LANGUAGES C CXX)
# The core sources require C++17.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Core library.
# NOTE(review): a target named "libjustlm" yields "liblibjustlm.a" on
# Unix -- confirm the doubled "lib" prefix is intended.
add_library(libjustlm STATIC
libjustlm_core.cpp
justlm.hpp
)
set(LM_BACKEND "llama.cpp" CACHE STRING "The language model backend to use")
if (LM_BACKEND STREQUAL "libnc gpt2")
# GPT-2 backend: compiles the in-tree gpt2/ sources and links the
# prebuilt libnc shared object shipped in the repository.
add_library(libjustlm_gpt2 STATIC libjustlm_gpt2.cpp gpt2/arith.c gpt2/cp_utils.c gpt2/gpt2tc.c)
target_link_libraries(libjustlm_gpt2 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gpt2/libnc.so pthread)
elseif (LM_BACKEND STREQUAL "llama.cpp")
# llama.cpp backend: built from the git submodule added in .gitmodules.
add_subdirectory(llama.cpp)
add_library(libjustlm_llama STATIC libjustlm_llama.cpp)
target_link_libraries(libjustlm_llama PRIVATE llama)
else()
message(FATAL_ERROR "LM_BACKEND '${LM_BACKEND}' is unsupported. Please use either 'libnc gpt2' or 'llama.cpp'.")
endif()

1
gpt2/VERSION Normal file
View file

@ -0,0 +1 @@
2021-04-24

301
gpt2/arith.c Normal file
View file

@ -0,0 +1,301 @@
/*
* Arithmetic coder
*
* Copyright (c) 2018-2021 Fabrice Bellard
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <inttypes.h>
#include <assert.h>
#include <time.h>
#include <getopt.h>
#include "cutils.h"
#include "arith.h"
#define RANGE_MIN_BITS 16
#define RANGE_MIN ((0xff << (RANGE_MIN_BITS - 8)) + 1)
#define RANGE_MAX (0xff << RANGE_MIN_BITS)
//#define DUMP_PUT_BIT
//#define DUMP_GET_BIT
/* Initialise the arithmetic-coder output state. Encoded bytes are
   accumulated in 'buf' (of 'buf_size' bytes) and handed to
   write_func(opaque, ...) whenever the buffer fills up. */
void put_bit_init(PutBitState *s, uint8_t *buf, int buf_size,
                  PutBitWriteFunc *write_func, void *opaque)
{
    assert(PROB_UNIT <= RANGE_MIN);
    /* coder registers */
    s->range = RANGE_MAX;
    s->low = 0;
    /* deferred carry-propagation state */
    s->current_byte = 0xff;
    s->n_bytes = 0;
    /* output buffering */
    s->buf = buf;
    s->buf_size = buf_size;
    s->idx = 0;
    s->byte_count = 0;
    s->write_func = write_func;
    s->opaque = opaque;
}
/* Append one encoded byte to the output buffer; when the buffer
   becomes full, account for it and flush it via the user callback. */
static void put_byte(PutBitState *s, int v)
{
    s->buf[s->idx] = v;
    s->idx++;
    if (unlikely(s->idx == s->buf_size)) {
        s->byte_count += s->idx;
        s->write_func(s->opaque, s->buf, s->idx);
        s->idx = 0;
    }
}
/* 0 <= v <= 0x1fe. The current output stream contains n_bytes with:
   current_byte, then (n_bytes - 1) x 0xff
*/
/* Emit a 9-bit value into the byte stream with deferred carry
   propagation: 0xff bytes are only counted (not written) because a
   later carry may still turn them into 0x00. */
static void put_val(PutBitState *s, int v)
{
    uint32_t carry, b;
#ifdef DUMP_PUT_BIT
    printf(" out=%d\n", v);
#endif
    if (v == 0xff) {
        /* may still be modified by a future carry: just extend the run */
        s->n_bytes++;
    } else {
        if (s->n_bytes > 0) {
            /* resolve the pending run: bit 8 of v is the carry into the
               first pending byte; a carry turns the 0xff run into 0x00 */
            carry = v >> 8;
            put_byte(s, s->current_byte + carry);
            b = (0xff + carry) & 0xff;
            while (s->n_bytes > 1) {
                put_byte(s, b);
                s->n_bytes--;
            }
        }
        /* low 8 bits of v become the new pending byte */
        s->n_bytes = 1;
        s->current_byte = v;
    }
}
/* Flush the pending carry-propagation run so that every encoded byte
   has been handed to put_byte(). */
static void put_val_flush(PutBitState *s)
{
    if (s->n_bytes == 0)
        return;
    put_val(s, 0);
}
/* Renormalise the encoder: while the coding interval is too small,
   emit the top bits of 'low' and scale low/range up by 8 bits. */
static void put_bit_renorm(PutBitState *s)
{
    uint32_t v;
    /* after renormalisation:
       0 <= low <= RANGE_MAX
       RANGE_MIN <= range <= RANGE_MAX
       In the worst case before normalisation:
       low_max = 2 * RANGE_MAX hence v <= 0x1fe
    */
    while (s->range < RANGE_MIN) {
        /* top 9 bits of low go to the byte stream (carry handled there) */
        v = s->low >> RANGE_MIN_BITS;
        put_val(s, v);
        s->low = (s->low & ((1 << RANGE_MIN_BITS) - 1)) << 8;
        s->range <<= 8;
    }
}
/* Encode one bit. prob0 is the probability of bit == 0, scaled so
   that 0 < prob0 < PROB_UNIT. */
void put_bit(PutBitState *s, int prob0, int bit)
{
    int range0;

    assert(s->range >= RANGE_MIN);
    /* split the current interval according to prob0 */
    range0 = ((uint64_t)s->range * prob0) >> PROB_UNIT_BITS;
    assert(range0 > 0);
    assert(range0 < s->range);
#if defined(DUMP_PUT_BIT)
    {
        static int count;
        printf("%d: range=%d b=%d range0=%d low=%d\n",
               count++, s->range, bit, range0, s->low);
    }
#endif
    if (bit) {
        /* keep the upper sub-interval */
        s->low += range0;
        s->range -= range0;
    } else {
        /* keep the lower sub-interval */
        s->range = range0;
    }
    put_bit_renorm(s);
}
/* Encode one bit without a probability model (implicit p0 = 1/2). */
void put_bit_raw(PutBitState *s, int bit)
{
    int half;

    assert(s->range >= RANGE_MIN);
    half = s->range >> 1;
    if (bit) {
        s->low += half;
        s->range -= half;
    } else {
        s->range = half;
    }
    put_bit_renorm(s);
}
/* return the minimum number of bits to be able to correctly decode */
/* Terminate the stream: choose a value inside [low, low+range) with as
   many trailing zero bits as possible, emit it, and flush the output
   buffer through write_func. */
int64_t put_bit_flush(PutBitState *s)
{
    int n, val, mask;
    /* force larger range */
    if (s->range < (1 << RANGE_MIN_BITS)) {
        put_val(s, s->low >> RANGE_MIN_BITS);
        s->low = (s->low & ((1 << RANGE_MIN_BITS) - 1)) << 8;
        s->range <<= 8;
    }
    /* largest n such as 2^n <= range */
    n = 0;
    while ((1 << (n + 1)) <= s->range)
        n++;
    assert(n >= RANGE_MIN_BITS && n <= (RANGE_MIN_BITS + 7));
    /* round low up to a multiple of 2^n; it stays inside the interval
       because range >= 2^n */
    val = s->low;
    mask = (1 << n) - 1;
    if ((val & mask) != 0)
        val = (val + (1 << n)) & ~mask;
    assert(val >= s->low && val < s->low + s->range);
    put_val(s, val >> RANGE_MIN_BITS);
    put_val_flush(s);
    /* hand any buffered bytes to the user callback */
    if (s->idx > 0) {
        s->byte_count += s->idx;
        s->write_func(s->opaque, s->buf, s->idx);
        s->idx = 0;
    }
    return (s->byte_count - 1) * 8 + (RANGE_MIN_BITS + 8 - n);
}
/* return the approximate number of written bits */
int64_t put_bit_get_bit_count(PutBitState *s)
{
    int n;

    /* n = floor(log2(range)) */
    for (n = 0; (1 << (n + 1)) <= s->range; n++)
        continue;
    return (s->byte_count + s->idx) * 8 + (RANGE_MIN_BITS + 7 - n);
}
/****************************************/
/* Decoder-side refill: shift one more input byte into 'low', scaling
   'range' accordingly. When the buffer is exhausted it is refilled via
   read_func; with no read_func the input is padded with zero bytes. */
static void refill(GetBitState *s)
{
    s->range <<= 8;
    s->low <<= 8;
    if (s->idx >= s->buf_len) {
        if (!s->read_func)
            return; /* pad with zeros */
        s->buf_len = s->read_func(s->opaque, s->buf, s->buf_size);
        s->byte_count += s->buf_len;
        s->idx = 0;
    }
#ifdef DUMP_GET_BIT
    printf(" in=%d\n", s->buf[s->idx]);
#endif
    s->low += s->buf[s->idx++];
}
/* Initialise the arithmetic-coder input state. If read_func is NULL,
   'buf' is treated as a prefilled buffer of buf_size bytes; otherwise
   bytes are pulled on demand via read_func(opaque, ...). */
void get_bit_init(GetBitState *s, uint8_t *buf, size_t buf_size,
                  GetBitReadFunc *read_func, void *opaque)
{
    int i;
    s->buf_size = buf_size;
    s->buf = buf;
    s->read_func = read_func;
    s->opaque = opaque;
    if (read_func) {
        s->buf_len = 0;
    } else {
        /* prefilled buffer */
        s->buf_len = s->buf_size;
    }
    s->byte_count = s->buf_len;
    s->range = 0;
    s->low = 0;
    s->idx = 0;
    /* prime 'low' with the first input bytes (each refill shifts in 8
       bits), mirroring the encoder's initial state */
    for(i = 0; i <= RANGE_MIN_BITS; i += 8) {
        refill(s);
    }
    s->range = RANGE_MAX;
}
/* Decode one bit. prob0 is the probability of bit == 0, scaled so
   that 0 < prob0 < PROB_UNIT; mirrors put_bit(). */
int get_bit(GetBitState *s, int prob0)
{
    int b, range0;

    assert(s->range >= RANGE_MIN);
    /* split the interval exactly as the encoder did */
    range0 = ((uint64_t)s->range * prob0) >> PROB_UNIT_BITS;
    assert(range0 > 0);
    assert(range0 < s->range);
    b = (s->low >= range0);
#ifdef DUMP_GET_BIT
    {
        static int count;
        printf("%d: range=%d b=%d range0=%d low=%d\n", count++, s->range, b, range0, s->low);
    }
#endif
    if (!b) {
        s->range = range0;
    } else {
        s->low -= range0;
        s->range -= range0;
    }
    while (s->range < RANGE_MIN)
        refill(s);
    return b;
}
/* Decode one bit written by put_bit_raw() (no probability model). */
int get_bit_raw(GetBitState *s)
{
    int b, half;

    half = s->range >> 1;
    b = (s->low >= half);
    if (!b) {
        s->range = half;
    } else {
        s->low -= half;
        s->range -= half;
    }
    if (s->range < RANGE_MIN)
        refill(s);
    return b;
}
/* return the approximate number of read bits */
int64_t get_bit_get_bit_count(GetBitState *s)
{
    int n;

    /* n = floor(log2(range)) */
    for (n = 0; (1 << (n + 1)) <= s->range; n++)
        continue;
    return (s->byte_count - s->buf_len + s->idx) * 8 - n;
}

73
gpt2/arith.h Normal file
View file

@ -0,0 +1,73 @@
/*
* Arithmetic coder
*
* Copyright (c) 2018-2019 Fabrice Bellard
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef ARITH_H
#define ARITH_H
/* Public interface of the binary arithmetic coder (arith.c).
   Bit probabilities are integers p with 0 < p < PROB_UNIT. */
#define PROB_UNIT_BITS 15
#define PROB_UNIT (1 << PROB_UNIT_BITS)
/* callback invoked by the encoder to flush a buffer of encoded bytes */
typedef void PutBitWriteFunc(void *opaque, const uint8_t *buf, size_t buf_size);
/* encoder state */
typedef struct {
    uint32_t range;       /* current coding interval width */
    uint32_t low;         /* current coding interval base */
    uint8_t current_byte; /* first byte of the pending carry run */
    uint32_t n_bytes;     /* pending run: current_byte then (n_bytes-1) x 0xff */
    uint8_t *buf;         /* output buffer */
    size_t buf_size;
    size_t idx; /* current position in bytes */
    PutBitWriteFunc *write_func;
    void *opaque;         /* user pointer passed to write_func */
    uint64_t byte_count;  /* total bytes flushed so far */
} PutBitState;
void put_bit_init(PutBitState *s, uint8_t *buf, int buf_size,
                  PutBitWriteFunc *write_func, void *opaque);
/* encode one bit; prob0 is the probability of bit == 0 */
void put_bit(PutBitState *s, int prob0, int bit);
/* encode one bit with implicit probability 1/2 */
void put_bit_raw(PutBitState *s, int bit);
/* terminate the stream; returns the minimum bit count needed to decode */
int64_t put_bit_flush(PutBitState *s);
int64_t put_bit_get_bit_count(PutBitState *s);
/* return the number of read bytes */
typedef ssize_t GetBitReadFunc(void *opaque, uint8_t *buf, size_t buf_size);
/* decoder state */
typedef struct {
    uint8_t *buf;  /* input buffer */
    int buf_len;   /* number of valid bytes in buf */
    int buf_size;
    int idx;       /* read position in buf */
    uint32_t low;
    uint32_t range;
    GetBitReadFunc *read_func; /* NULL when buf is prefilled */
    void *opaque;
    uint64_t byte_count;
} GetBitState;
void get_bit_init(GetBitState *s, uint8_t *buf, size_t buf_size,
                  GetBitReadFunc *read_func, void *opaque);
/* decode one bit; prob0 is the probability of bit == 0 */
int get_bit(GetBitState *s, int prob0);
int get_bit_raw(GetBitState *s);
int64_t get_bit_get_bit_count(GetBitState *s);
#endif /* ARITH_H */

316
gpt2/cp_utils.c Normal file
View file

@ -0,0 +1,316 @@
/*
* Compression utilities
*
* Copyright (c) 2018-2019 Fabrice Bellard
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <inttypes.h>
#include <assert.h>
#include <time.h>
#include <getopt.h>
#include <stdarg.h>
#include <sys/time.h>
#include <sys/stat.h>
#ifdef _WIN32
#include <direct.h>
#endif
#include "cutils.h"
#include "libnc.h"
#include "cp_utils.h"
/* Print a formatted fatal-error message on stderr and terminate the
   process with exit status 1. Never returns (declared noreturn in
   cp_utils.h). */
void fatal_error(const char *fmt, ...)
{
    va_list ap;

    va_start(ap, fmt);
    fprintf(stderr, "Fatal error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    /* each va_start must be paired with va_end (C standard 7.16.1);
       the original leaked the va_list before exiting */
    va_end(ap);
    exit(1);
}
/* Return a millisecond timestamp: CLOCK_MONOTONIC on POSIX systems,
   gettimeofday() on Windows/MinGW builds. */
int64_t get_time_ms(void)
{
#ifdef _WIN32
    struct timeval tv;
    int64_t ms;
    gettimeofday(&tv, NULL);
    ms = (int64_t)tv.tv_sec * 1000;
    ms += tv.tv_usec / 1000U;
    return ms;
#else
    struct timespec ts;
    int64_t ms;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    ms = (int64_t)ts.tv_sec * 1000;
    ms += ts.tv_nsec / 1000000U;
    return ms;
#endif
}
/* Write one byte to stream f. */
void fput_u8(FILE *f, uint8_t v)
{
    fwrite(&v, 1, 1, f);
}
/* Read one byte from f into *pv. Returns 0 on success, -1 on
   EOF/error (in which case *pv is untouched). */
int fget_u8(FILE *f, uint8_t *pv)
{
    int c = fgetc(f);
    if (c < 0)
        return -1;
    *pv = (uint8_t)c;
    return 0;
}
/* Write v to f in big-endian byte order (2 bytes). */
void fput_be16(FILE *f, uint16_t v)
{
    uint8_t buf[2];
    buf[0] = v >> 8;
    buf[1] = v & 0xff;
    fwrite(buf, 1, sizeof(buf), f);
}
/* Read a big-endian 16-bit value from f. Returns 0 on success,
   -1 on EOF/short read. */
int fget_be16(FILE *f, uint16_t *pv)
{
    int hi, lo;
    hi = fgetc(f);
    if (hi < 0)
        return -1;
    lo = fgetc(f);
    if (lo < 0)
        return -1;
    *pv = (uint16_t)((hi << 8) | lo);
    return 0;
}
/* Write v to f in big-endian byte order (4 bytes). */
void fput_be32(FILE *f, uint32_t v)
{
    int shift;
    for (shift = 24; shift >= 0; shift -= 8)
        fputc((v >> shift) & 0xff, f);
}
/* Read a big-endian 32-bit value from f. Returns 0 on success,
   -1 on EOF/short read. */
int fget_be32(FILE *f, uint32_t *pv)
{
    uint8_t buf[4];
    uint32_t v;
    int i;

    if (fread(buf, 1, sizeof(buf), f) != sizeof(buf))
        return -1;
    v = 0;
    for (i = 0; i < 4; i++)
        v = (v << 8) | buf[i];
    *pv = v;
    return 0;
}
/* Serialise SGD optimiser parameters: one algorithm byte followed by
   the algorithm-specific fields. Aborts on an unknown algorithm. */
void fput_sgd_opt(FILE *f, const SGDOptParams *p)
{
    fput_u8(f, p->algo);
    if (p->algo == SGD_OPT_BASIC) {
        /* no extra parameters */
    } else if (p->algo == SGD_OPT_ADAM) {
        fput_f32(f, p->u.adam.beta1);
        fput_f32(f, p->u.adam.beta2);
        fput_f32(f, p->u.adam.eps);
        fput_f32(f, p->u.adam.gradient_clip);
    } else {
        abort();
    }
}
/* Deserialise SGD optimiser parameters written by fput_sgd_opt().
   Returns 0 on success, -1 on read error or unknown algorithm. */
int fget_sgd_opt(FILE *f, SGDOptParams *p)
{
    uint8_t algo;

    if (fget_u8(f, &algo))
        return -1;
    p->algo = algo;
    if (p->algo == SGD_OPT_BASIC)
        return 0;
    if (p->algo != SGD_OPT_ADAM)
        return -1;
    if (fget_f32(f, &p->u.adam.beta1))
        return -1;
    if (fget_f32(f, &p->u.adam.beta2))
        return -1;
    if (fget_f32(f, &p->u.adam.eps))
        return -1;
    if (fget_f32(f, &p->u.adam.gradient_clip))
        return -1;
    return 0;
}
/* Print a human-readable dump of the optimiser parameters to f.
   Aborts on an unknown algorithm. */
void dump_sgd_opt_params(FILE *f, const SGDOptParams *p)
{
    if (p->algo == SGD_OPT_BASIC) {
        fprintf(f, " sgd_opt=%s",
                "none");
    } else if (p->algo == SGD_OPT_ADAM) {
        fprintf(f, " sgd_opt=%s beta1=%g beta2=%g eps=%g gclip=%g",
                "adam",
                p->u.adam.beta1,
                p->u.adam.beta2,
                p->u.adam.eps,
                p->u.adam.gradient_clip);
    } else {
        abort();
    }
}
/* Union used to reinterpret the bits of a float as a uint32_t for
   endian-stable serialisation (type punning through a union is
   well-defined in C). */
typedef union {
    float f;
    uint32_t u32;
} f32;
/* Write a float as its IEEE-754 bit pattern, big-endian. */
void fput_f32(FILE *f, float v)
{
    uint32_t bits;
    memcpy(&bits, &v, sizeof(bits));
    fput_be32(f, bits);
}
/* Read a big-endian IEEE-754 bit pattern into a float. Returns 0 on
   success, -1 on error. */
int fget_f32(FILE *f, float *pv)
{
    uint32_t bits;
    if (fget_be32(f, &bits))
        return -1;
    memcpy(pv, &bits, sizeof(*pv));
    return 0;
}
/* Arithmetic-encode symbol 'sym' (0 <= sym < n_symb) against the given
   probability table by binary bisection of the symbol range: each step
   encodes one bit selecting the lower or upper half, with the bit
   probability derived from the summed table mass of the lower half. */
void write_sym(PutBitState *pb, const float *prob_table, int n_symb, int sym)
{
    int start, range, prob0, bit, range0;
    float p, p0;
    start = 0;
    range = n_symb;
    p = 1.0; /* invariant: p=sum(prob_table[start...start + range]) */
    while (range > 1) {
        range0 = range >> 1;
        /* mass of the lower half, rescaled to the PROB_UNIT fixed point;
           clamped so put_bit always gets 0 < prob0 < PROB_UNIT */
        p0 = vec_sum_f32(prob_table + start, range0);
        prob0 = lrintf(p0 * PROB_UNIT / p);
        prob0 = clamp_int(prob0, 1, PROB_UNIT - 1);
        bit = sym >= (start + range0);
        put_bit(pb, prob0, bit);
        if (bit) {
            start += range0;
            range = range - range0;
            p = p - p0;
        } else {
            p = p0;
            range = range0;
        }
    }
}
/* Decode one symbol written by write_sym(); performs the identical
   bisection so that the probabilities match the encoder bit for bit.
   Returns the decoded symbol index in [0, n_symb). */
int read_sym(GetBitState *gb, const float *prob_table, int n_symb)
{
    int start, range, prob0, bit, range0;
    float p, p0;
    start = 0;
    range = n_symb;
    p = 1.0; /* invariant: p=sum(prob_table[start...start + range]) */
    while (range > 1) {
        range0 = range >> 1;
        /* must compute exactly the same prob0 as write_sym() */
        p0 = vec_sum_f32(prob_table + start, range0);
        prob0 = lrintf(p0 * PROB_UNIT / p);
        prob0 = clamp_int(prob0, 1, PROB_UNIT - 1);
        bit = get_bit(gb, prob0);
        if (bit) {
            start += range0;
            range = range - range0;
            p = p - p0;
        } else {
            p = p0;
            range = range0;
        }
    }
    return start;
}
/* Best-effort creation of <debug_path>/<prefix>/<YYYYMMDD-HHMMSS>;
   the resulting path is written into debug_dir. mkdir failures are
   ignored, matching the original behaviour. */
void create_debug_dir(char *debug_dir, size_t debug_dir_size,
                      const char *debug_path, const char *prefix)
{
    char parent[1024];
    struct tm *tm;
    time_t now;

    /* <debug_path>/<prefix> */
    snprintf(parent, sizeof(parent), "%s/%s", debug_path, prefix);
#ifdef _WIN32
    _mkdir(parent);
#else
    mkdir(parent, 0777);
#endif
    /* timestamped subdirectory from the local time */
    now = time(NULL);
    tm = localtime(&now);
    snprintf(debug_dir, debug_dir_size, "%s/%04u%02u%02u-%02u%02u%02u",
             parent,
             tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
             tm->tm_hour, tm->tm_min, tm->tm_sec);
#ifdef _WIN32
    _mkdir(debug_dir);
#else
    mkdir(debug_dir, 0777);
#endif
}
/* we print at least 3 significant digits with at most 5 chars, except
   if larger than 9999T. The value is rounded to zero. */
char *get_si_prefix(char *buf, int buf_size, uint64_t val)
{
    static const char suffixes[] = "kMGT";
    uint64_t base;
    int i;

    if (val <= 999) {
        snprintf(buf, buf_size, "%" PRId64, val);
        return buf;
    }
    base = 1000;
    for (i = 0; i < 4; i++) {
        /* Note: we round to 0 */
        if (val < base * 10) {
            snprintf(buf, buf_size, "%0.2f%c",
                     floor((val * 100.0) / base) / 100.0,
                     suffixes[i]);
            return buf;
        }
        if (val < base * 100) {
            snprintf(buf, buf_size, "%0.1f%c",
                     floor((val * 10.0) / base) / 10.0,
                     suffixes[i]);
            return buf;
        }
        if (val < base * 1000 || i == 3) {
            snprintf(buf, buf_size,
                     "%" PRId64 "%c",
                     val / base,
                     suffixes[i]);
            return buf;
        }
        base = base * 1000;
    }
    return buf;
}

48
gpt2/cp_utils.h Normal file
View file

@ -0,0 +1,48 @@
/*
* Compression utilities
*
* Copyright (c) 2018-2019 Fabrice Bellard
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "arith.h"
#include "libnc.h"
/* NOTE(review): this header has no include guard; it apparently relies
   on being included once per translation unit -- confirm. */
/* Print "Fatal error: <msg>" on stderr and exit(1). */
void __attribute__((noreturn, format(printf, 1, 2))) fatal_error(const char *fmt, ...);
/* Millisecond timestamp (CLOCK_MONOTONIC on POSIX). */
int64_t get_time_ms(void);
/* Big-endian primitive serialisation; fget_* return 0 on success, -1 on error. */
void fput_u8(FILE *f, uint8_t v);
int fget_u8(FILE *f, uint8_t *pv);
void fput_be16(FILE *f, uint16_t v);
int fget_be16(FILE *f, uint16_t *pv);
void fput_be32(FILE *f, uint32_t v);
int fget_be32(FILE *f, uint32_t *pv);
void fput_f32(FILE *f, float v);
int fget_f32(FILE *f, float *pv);
/* SGD optimiser parameter (de)serialisation and debug dump. */
void fput_sgd_opt(FILE *f, const SGDOptParams *p);
int fget_sgd_opt(FILE *f, SGDOptParams *p);
void dump_sgd_opt_params(FILE *f, const SGDOptParams *p);
/* Arithmetic-coded symbol I/O driven by a probability table. */
void write_sym(PutBitState *pb, const float *prob_table, int n_symb, int sym);
int read_sym(GetBitState *gb, const float *prob_table, int n_symb);
/* Create <debug_path>/<prefix>/<timestamp> and return it in debug_dir. */
void create_debug_dir(char *debug_dir, size_t debug_dir_size,
                      const char *debug_path, const char *prefix);
/* Format val into buf with an SI suffix (k/M/G/T); returns buf. */
char *get_si_prefix(char *buf, int buf_size, uint64_t val);

152
gpt2/cutils.h Normal file
View file

@ -0,0 +1,152 @@
/* Generic C utility macros and inline helpers: compiler hints,
   min/max/clamp, bit-scan wrappers and a cycle counter. */
#ifndef CUTILS_H
#define CUTILS_H
#include <inttypes.h>
/* inlining control (GCC/Clang attributes) */
#define force_inline inline __attribute__((always_inline))
#define no_inline __attribute__((noinline))
#define __unused __attribute__((unused))
/* two-level glue so that macro arguments are expanded before pasting */
#define xglue(x, y) x ## y
#define glue(x, y) xglue(x, y)
#ifndef offsetof
#define offsetof(type, field) ((size_t) &((type *)0)->field)
#endif
/* number of elements of a static array */
#define countof(x) (sizeof(x) / sizeof(x[0]))
/* branch prediction hints */
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
typedef int BOOL;
#ifndef FALSE
enum {
    FALSE = 0,
    TRUE = 1,
};
#endif
/* raw 16-bit container for bfloat16 values */
typedef struct {
    uint16_t u16;
} bfloat16_t;
/* read the CPU timestamp counter (used for profiling) */
#if defined(__x86_64__)
static inline int64_t get_cycles(void)
{
    uint32_t low,high;
    int64_t val;
    asm volatile("rdtsc" : "=a" (low), "=d" (high));
    val = high;
    val <<= 32;
    val |= low;
    return val;
}
#else
/* NOTE(review): the "=A" constraint is i386-specific, so this fallback
   does not build on non-x86 targets -- confirm intended portability. */
static inline int64_t get_cycles(void)
{
    int64_t val;
    asm volatile ("rdtsc" : "=A" (val));
    return val;
}
#endif
/* min/max helpers for the integer types used in the code base */
static inline int max_int(int a, int b)
{
    if (a > b)
        return a;
    else
        return b;
}
static inline int min_int(int a, int b)
{
    if (a < b)
        return a;
    else
        return b;
}
static inline size_t max_size_t(size_t a, size_t b)
{
    if (a > b)
        return a;
    else
        return b;
}
static inline size_t min_size_t(size_t a, size_t b)
{
    if (a < b)
        return a;
    else
        return b;
}
static inline ssize_t max_ssize_t(ssize_t a, ssize_t b)
{
    if (a > b)
        return a;
    else
        return b;
}
static inline ssize_t min_ssize_t(ssize_t a, ssize_t b)
{
    if (a < b)
        return a;
    else
        return b;
}
/* clamp val into [min_val, max_val] */
static inline int clamp_int(int val, int min_val, int max_val)
{
    if (val < min_val)
        return min_val;
    else if (val > max_val)
        return max_val;
    else
        return val;
}
static inline float clamp_float(float val, float min_val, float max_val)
{
    if (val < min_val)
        return min_val;
    else if (val > max_val)
        return max_val;
    else
        return val;
}
/* count leading zeros. WARNING: undefined if a = 0 */
static inline int clz32(unsigned int a)
{
    return __builtin_clz(a);
}
/* WARNING: undefined if a = 0 */
static inline int clz64(uint64_t a)
{
    return __builtin_clzll(a);
}
/* floor(log2(a)); requires a != 0 (inherits the clz64 restriction) */
static inline int floor_log2(uint64_t a)
{
    return 63 - clz64(a);
}
/* ceil(log2(a)); returns 0 for a <= 1 */
static inline int ceil_log2(uint64_t a)
{
    if (a <= 1)
        return 0;
    else
        return 64 - clz64(a - 1);
}
static inline float squaref(float x)
{
    return x * x;
}
/* expand one initialiser into eight copies */
#define DUP8(a) a, a, a, a, a, a, a, a
#endif /* CUTILS_H */

2023
gpt2/gpt2tc.c Normal file

File diff suppressed because it is too large Load diff

143
gpt2/gpt2tc.h Normal file
View file

@ -0,0 +1,143 @@
#ifndef _GPT2TC_H
#define _GPT2TC_H
#ifdef __cplusplus
extern "C" {
#endif
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>
#include "cutils.h"
#include "arith.h"
#include "cp_utils.h"
#include "list.h"
#include "libnc.h"
/* Interface of the GPT-2 text-completion / text-compression engine
   implemented in gpt2tc.c on top of the libnc tensor library. */
#define MAX_INITIAL_TEXT_LEN 256 /* in symbols */
#define MAX_OUTPUT_LEN 100
#define DEFAULT_TOP_K 40
#define DEFAULT_TOP_P 0.9
#define BATCH_SIZE_MAX 16
//#define BATCH_SIZE_MAX 1
/* one tokenised text symbol (presumably a vocabulary/BPE token id --
   confirm against gpt2tc.c) */
typedef uint16_t DataSymbol;
/* supported GPT-2 checkpoint sizes */
typedef enum {
    GPT2_MODEL_117M,
    GPT2_MODEL_345M,
    GPT2_MODEL_774M,
    GPT2_MODEL_1558M,
} GPT2ModelEnum;
/* transformer hyper-parameters */
typedef struct {
    BOOL is_decoder;
    int n_layer;    /* number of transformer blocks */
    int d_model;    /* model (embedding) width */
    int n_head;     /* attention head count */
    int d_key;
    int d_value;
    int d_inner;    /* feed-forward hidden width */
    int n_ctx;      /* maximum context length */
    int n_symbols;  /* vocabulary size */
    uint32_t seed;
} TransformerModelParams;
/* weight tensors of one transformer block */
typedef struct {
    NCTensor *ln_1_g, *ln_1_b;
    NCTensor *attn_w, *attn_b;
    NCTensor *attn_proj_w, *attn_proj_b;
    NCTensor *ln_2_g, *ln_2_b;
    NCTensor *mlp_fc_w, *mlp_fc_b;
    NCTensor *mlp_proj_w, *mlp_proj_b;
} TransformerLayer;
/* a fully loaded transformer model plus its libnc context/device */
typedef struct {
    RNDState rnd_state;
    NCContext *model;
    NCDevice *device;
    int n_layer;
    int d_model;
    int n_head;
    int d_key;
    int d_value;
    int d_inner;
    int n_symbols;
    int n_ctx;
    /* parameters */
    NCParamList param_list;
    TransformerLayer *layers;
    NCTensor *wte, *wpe, *wte_trans; /* token/position embedding tables */
    NCTensor *ln_f_g, *ln_f_b;       /* final layer-norm gain/bias */
} TransformerModel;
/* one vocabulary entry; entries chain through 'next' */
typedef struct Word {
    uint32_t next; /* -1 = end */
    uint32_t len;  /* length of buf in bytes */
    uint8_t *buf;
} Word;
/* word table with hash lookup (presumably the tokeniser vocabulary --
   confirm in gpt2tc.c) */
typedef struct {
    Word *words;
    size_t word_count;
    size_t word_size;
    uint32_t *hash_table;
    int hash_size;
    int hash_bits;
} WordList;
/* state shared by all generation contexts: model plus word list */
typedef struct {
    TransformerModel *trf_state;
    WordList *wl;
} TextCompleteGlobalState;
/* one in-flight generation request, linked into a batch list */
typedef struct {
    struct list_head link;
    TextCompleteGlobalState *global_state;
    int top_k;          /* top-k sampling cutoff */
    float top_p;        /* top-p (nucleus) sampling cutoff */
    float temperature;
    RNDState rnd_state;
    NCTensor **mem_k, **mem_v; /* per-layer k/v tensors (presumably the
                                  attention cache -- see gpt2tc.c) */
    DataSymbol *input_buf;
    int input_buf_len;
    int text_len; /* current input text len */
    BOOL is_first;
    int last_c;
    int max_output_len;
    /* output */
    char out_text[1024];
    int out_text_len; /* 0 means end of output */
} TextGenContext;
/* model selection / parameter helpers */
GPT2ModelEnum parse_model(const char *str);
void trf_set_params(TransformerModelParams *p, GPT2ModelEnum model);
/* tokenisation (file-level and buffer-level) */
void gpt2_pp_encode(const char *word_filename, const char *in_filename, const char *out_filename);
size_t gpt2_pp_encode_buf(WordList *s, DataSymbol **pout_buf, const uint8_t *buf, size_t buf_size);
void gpt2_pp_decode(const char *word_filename, const char *in_filename, const char *out_filename);
char *trim_text(const char *str);
/* batched text-completion API */
TextCompleteGlobalState *text_complete_global_init(GPT2ModelEnum model, const char *filename);
void text_complete_global_end(TextCompleteGlobalState *tcs);
TextGenContext *text_complete_start(TextCompleteGlobalState *tcs, const char *input_text, int top_k, float top_p, float temperature, int seed, int max_output_len);
void text_complete_next(TextCompleteGlobalState *tcs, struct list_head *ts_list);
void text_complete_end(TextGenContext *ts);
void text_complete(GPT2ModelEnum model, const char *model_filename, const char *input_text, int top_k, float top_p, float temperature, int max_output_len, int batch_size, int seed, BOOL verbose);
/* UTF-8 helpers */
int unicode_to_utf8(uint8_t *buf, unsigned int c);
int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp);
/* binary <-> printable-text conversion used by the compressor */
size_t convert_to_chars(char **pout_buf, uint8_t *buf, size_t n_bits);
ssize_t convert_from_chars(uint8_t **pout_buf, const char *str);
int encode_length(PutBitState *pb, uint32_t val);
int decode_length(GetBitState *gb);
/* GPT-2 based text / file compression */
int text_decompress(TextCompleteGlobalState *tcs, char **poutput_text, const char *input_text);
int text_compress(TextCompleteGlobalState *tcs, char **poutput_text, const char *input_text, BOOL dump_stats);
void text_compress_test(GPT2ModelEnum model, const char *model_filename, const char *input_text, BOOL is_decode, BOOL verbose);
int file_compress(TextCompleteGlobalState *tcs, const char *infilename, const char *outfilename);
int file_decompress(TextCompleteGlobalState *tcs, const char *infilename, const char *outfilename);
#ifdef __cplusplus
}
#endif
#endif

BIN
gpt2/gpt2vocab.txt Normal file

Binary file not shown.

426
gpt2/libnc.h Normal file
View file

@ -0,0 +1,426 @@
/*
* LibNC
*
* Copyright (c) 2018-2019 Fabrice Bellard
*
*/
#ifndef LIBNC_H
#define LIBNC_H
#include <inttypes.h>
#include "cutils.h"
#include "list.h"
/* profiling */
typedef enum {
PROF_EVAL,
PROF_GRAD,
PROF_SGD,
PROF_UPDATE,
PROF_WRITE_SYM,
PROF_PROBE,
PROF_TOTAL,
PROF_COUNT,
} ProfEnum;
/* Profiling helpers: when built with -DPROFILE each prof_* call
   accumulates cycle counts (via get_cycles()) and sample/op counts per
   category; otherwise they compile to empty inline stubs. */
#ifdef PROFILE
extern int64_t prof_cycles[PROF_COUNT];
extern int64_t prof_samples[PROF_COUNT];
extern int64_t prof_ops[PROF_COUNT];
/* start timing category 'idx' */
static inline void prof_start(int idx)
{
    prof_cycles[idx] -= get_cycles();
}
/* stop timing category 'idx' and count one sample */
static inline void prof_end(int idx)
{
    prof_cycles[idx] += get_cycles();
    prof_samples[idx]++;
}
/* stop timing and additionally credit 'n_ops' operations */
static inline void prof_end_ops(int idx, int n_ops)
{
    prof_cycles[idx] += get_cycles();
    prof_ops[idx] += n_ops;
    prof_samples[idx]++;
}
#else
/* no-op stubs when profiling is disabled */
static inline void prof_start(int idx)
{
}
static inline void prof_end(int idx)
{
}
static inline void prof_end_ops(int idx, int n_ops)
{
}
#endif
void nc_prof_dump(void);
/* Automatic Differentiation Engine */
typedef struct NCContext NCContext;
typedef struct NCDevice NCDevice;
typedef struct NCTensor NCTensor;
typedef struct NCTensorBuffer NCTensorBuffer;
typedef struct NCNode NCNode;
typedef struct NCRNDState NCRNDState;
typedef struct NCSGDOptState NCSGDOptState;
typedef enum {
NC_TYPE_F32,
NC_TYPE_BF16,
NC_TYPE_F16,
NC_TYPE_I8,
NC_TYPE_I16,
NC_TYPE_I32,
NC_TYPE_COUNT,
} NCTypeEnum;
extern size_t nc_type_size_table[NC_TYPE_COUNT];
extern const char *nc_type_name_table[NC_TYPE_COUNT];
#define NC_N_DIMS_MAX 4 /* maximum number of axis for tensors */
typedef struct NCTensorData {
NCTypeEnum item_type;
size_t item_size;
void *data;
size_t stride; /* in elements */
size_t n_strides; /* prod(j = 1 ... n_dims, dims[j]); */
int n_dims;
const size_t *dims; /* n_dims length */
const size_t *strides; /* n_dims length, strides in bytes */
} NCTensorData;
void *nc_malloc(size_t size);
void *nc_mallocz(size_t size);
void nc_free(void *ptr);
NCContext *nc_context_init(int nb_threads);
void nc_context_end(NCContext *m);
NCDevice *nc_new_cpu_device(NCContext *m);
NCDevice *nc_new_cuda_device(NCContext *m, int device_index);
NCDevice *nc_new_device(NCContext *m, const char *device_name);
void nc_synchronize(NCDevice *d);
NCTensorBuffer *nc_new_tensor_buffer(NCDevice *d, size_t size);
NCTensorBuffer *nc_dup_tensor_buffer(const NCTensorBuffer *b);
void nc_free_tensor_buffer(NCTensorBuffer *b);
NCTensor *nc_new_tensor(NCDevice *d, NCTypeEnum type,
int n_dims, const size_t *dims);
NCTensor *nc_new_tensor_from_tensor(const NCTensor *x);
NCTensor *nc_new_tensor_from_tensor_nz(const NCTensor *x);
NCTensor *nc_new_scalar(NCDevice *d, NCTypeEnum type);
NCTensor *nc_new_tensor_1d(NCDevice *d, NCTypeEnum type, size_t len);
NCTensor *nc_new_tensor_2d(NCDevice *d, NCTypeEnum type, size_t n0, size_t n1);
NCTensor *nc_new_tensor_3d(NCDevice *d, NCTypeEnum type,
size_t n0, size_t n1, size_t n2);
NCTensor *nc_new_tensor_4d(NCDevice *d, NCTypeEnum type,
size_t n0, size_t n1, size_t n2, size_t n3);
NCTensor *__attribute__((format(printf, 2, 3))) nc_tensor_set_name(NCTensor *x, const char *fmt, ...);
NCTensor *nc_dup_tensor(const NCTensor *x);
void nc_free_tensor(NCTensor *x);
void nc_dump_tensor(const char *name, NCTensor *x, size_t n);
uint32_t nc_tensor_get_hash(NCTensor *x);
void nc_dump_tensor_hash(const char *name, const NCTensor *x);
NCNode *nc_get_node(NCTensor *x);
/* create an alias to tensor 'x1'. Gradient is not propagated thru it */
NCTensor *nc_slice_alias(const NCTensor *x1, int axis, size_t start, size_t end);
NCTypeEnum nc_tensor_get_item_type(const NCTensor *x);
NCTensorData *nc_tensor_get_data(NCTensorData *sd, const NCTensor *x);
/* Return a pointer to the tensor data. If *pstride is non NULL,
return the stride (in elements) of the first dimension. */
void *nc_tensor_get_ptr(NCTensor *x, size_t *pstride);
const size_t *nc_tensor_get_dims(const NCTensor *x, int *pn_dims);
void nc_tensor_set_zero(NCTensor *y);
void nc_tensor_set_f32(NCTensor *y, float val);
NCRNDState *nc_rnd_init(NCDevice *d, uint32_t seed);
void nc_rnd_end(NCRNDState *s);
void nc_tensor_set_rnd_unif(NCTensor *y, float avg, float range,
NCRNDState *rnd_state);
void nc_tensor_set_dropout(NCTensor *y, float prob, NCRNDState *rnd_state);
void nc_set1_i32(NCTensor *y, int n_dims, const size_t *tab_indexes,
int32_t val);
void nc_set1_i32_1d(NCTensor *y, size_t i0, int32_t val);
void nc_set1_i32_2d(NCTensor *y, size_t i0, size_t i1, int32_t val);
void nc_set1_f32(NCTensor *y, int n_dims, const size_t *tab_indexes,
float val);
void nc_set1_f32_1d(NCTensor *y, size_t i0, float val);
int32_t nc_get1_i32(const NCTensor *x, int n_dims, const size_t *tab_indexes);
float nc_get1_f32(const NCTensor *x, int n_dims, const size_t *tab_indexes);
float nc_get1_f32_1d(const NCTensor *x, size_t i0);
float nc_get_scalar_f32(const NCTensor *x);
void nc_tensor_copy(NCTensor *dst, NCTensor *src);
void nc_tensor_convert(NCTensor *dst, NCTensor *src);
void nc_dump_dims(const char *str, NCTensor *x);
size_t nc_get_heap_size(NCContext *m);
NCContext *nc_get_tensor_context(const NCTensor *x);
NCTensor *nc_tensor_to_device(NCTensor *x, NCDevice *d);
NCTensor *nc_tensor_to_cpu_device(NCTensor *x);
NCDevice *nc_get_tensor_device(const NCTensor *x);
/* element wise operations */
NCTensor *nc_convert(NCTensor *x, NCTypeEnum new_type);
NCTensor *nc_add(NCTensor *x1, NCTensor *x2);
NCTensor *nc_neg(NCTensor *x);
NCTensor *nc_sub(NCTensor *x1, NCTensor *x2);
NCTensor *nc_mul(NCTensor *x1, NCTensor *x2);
NCTensor *nc_div(NCTensor *x1, NCTensor *x2);
NCTensor *nc_recip(NCTensor *x);
NCTensor *nc_min(NCTensor *x1, NCTensor *x2);
NCTensor *nc_max(NCTensor *x1, NCTensor *x2);
/* select x1[i] if z[i] = 0 and x2[i] otherwise */
NCTensor *nc_select(NCTensor *z, NCTensor *x1, NCTensor *x2);
/* set y[i] = x1[i] if mask[i] = 0 and y[i] = c if mask[i] != 0. If
mask_inv is TRUE, 'mask' is inverted */
NCTensor *nc_masked_fill(NCTensor *x, NCTensor *mask, float c, BOOL mask_inv);
NCTensor *nc_sigmoid(NCTensor *x);
NCTensor *nc_tanh(NCTensor *x);
NCTensor *nc_relu(NCTensor *x);
NCTensor *nc_gelu(NCTensor *x);
NCTensor *nc_log(NCTensor *x);
/* return cp * fg + min(1 - fg, ig) * in */
NCTensor *nc_lstm_clamped(NCTensor *cp, NCTensor *in,
NCTensor *fg, NCTensor *ig);
/* return a * (1 - t) + b * t */
NCTensor *nc_lerp(NCTensor *a, NCTensor *b, NCTensor *t);
/* other operations */
NCTensor *nc_new_vec_f32(NCDevice *d, size_t n, float val);
NCTensor *nc_new_f32(NCDevice *d, float val);
NCTensor *nc_reshape(NCTensor *x, int n_dims, const size_t *dims);
NCTensor *nc_reshape_1d(NCTensor *x, size_t n0);
NCTensor *nc_reshape_2d(NCTensor *x, size_t n0, size_t n1);
NCTensor *nc_reshape_3d(NCTensor *x, size_t n0, size_t n1, size_t n2);
NCTensor *nc_reshape_4d(NCTensor *x, size_t n0, size_t n1, size_t n2,
size_t n3);
/* duplicate the tensor by adding n_dims dimensions */
NCTensor *nc_repeat(NCTensor *x, int n_dims, const size_t *dims);
NCTensor *nc_repeat_1d(NCTensor *x, size_t n);
/* return y0 + sum over the dimensions > n_dims of 'x'. y0 = NULL
is supported */
NCTensor *nc_reduce_sum(NCTensor *y0, NCTensor *x, int n_dims);
/* sum all the elements of a tensor */
NCTensor *nc_sum(NCTensor *x);
/* sum of squares */
NCTensor *nc_reduce_sum_sqr(NCTensor *x);
NCTensor *nc_slice(NCTensor *x, int axis, size_t start, size_t end);
NCTensor *nc_slice_add(NCTensor *y0, NCTensor *x, int axis, size_t start);
/* concatenation along axis 'axis' */
NCTensor *nc_concat(NCTensor **inputs, int n_inputs, int axis);
/* shortcut for axis = 0 */
NCTensor *nc_vconcat(NCTensor **inputs, int n_inputs);
/* shortcut for axis = 1 */
NCTensor *nc_hconcat(NCTensor **inputs, int n_inputs);
/* split along axis 'axis'. If tab_size = NULL, split equally. */
void nc_split(NCTensor **tab_y, NCTensor *x, int n_outputs,
const size_t *tab_size, int axis);
/* shortcut for axis = 0 */
void nc_vsplit(NCTensor **tab_y, NCTensor *x, int n_outputs,
const size_t *tab_size);
/* shortcut for axis = 1 */
void nc_hsplit(NCTensor **tab_y, NCTensor *x, int n_outputs,
const size_t *tab_size);
typedef enum {
NC_PAD_ZERO,
NC_PAD_DUP, /* duplicate element */
/* trim types, dual to padding */
NC_TRIM_NORMAL = NC_PAD_ZERO,
NC_TRIM_SUM, /* add trimmed elements to the edge */
} NCPadEnum;
/* pad (len > 0) or trim (len < 0) the axis 0 of 'x' */
NCTensor *nc_pad(NCTensor *x, ssize_t left_len, NCPadEnum left_op,
ssize_t right_len, NCPadEnum right_op);
/* shortcut to nc_pad() */
NCTensor *nc_resize(NCTensor *x, size_t n);
/* if x is not contiguous then create a new contiguous tensor and copy
x to it. Otherwise, return 'x'. */
NCTensor *nc_make_contiguous(NCTensor *x);
/* Return a new tensor sharing the same buffer as 'x' with the permuted
dimensions. axis[i] is the corresponding axis in 'x' */
NCTensor *nc_permute_alias(NCTensor *x, int n_dims, const int *axis);
/* same as nc_permute_alias but calls nc_make_contiguous after. */
NCTensor *nc_permute(NCTensor *x, int n_dims, const int *axis);
/* special case of nc_permute() */
NCTensor *nc_transpose(NCTensor *x);
NCTensor *nc_matmul(NCTensor *w, NCTensor *x);
/* return w*x + y0. w and x can be optionally transposed. y0 can be NULL */
NCTensor *nc_matmul_add(NCTensor *w, NCTensor *x, NCTensor *y0,
BOOL w_trans, BOOL x_trans);
NCTensor *nc_matmul_stride(NCTensor *w, NCTensor *x);
/* return a matrix where each column is the column x[i] of matrix 'w' */
NCTensor *nc_get_col(NCTensor *w, NCTensor *x);
/* add the vectors 'z' at column number 'x' in matrix 'w'. */
NCTensor *nc_add_col(NCTensor *z, NCTensor *x, NCTensor *w);
/* select the x-th element in each column of 'w' */
NCTensor *nc_get_element(NCTensor *w, NCTensor *x);
/* add z to the x-th element in each column of 'w' */
NCTensor *nc_add_element(NCTensor *z, NCTensor *x, NCTensor *w);
NCTensor *nc_soft_max(NCTensor *x);
/* Equivalent to y = log(get_element(x, eout)). It is expected to be
used as nc_indexed_log(nc_soft_max(x), eout) so that the gradient
computation is optimized. */
NCTensor *nc_indexed_log(NCTensor *x, NCTensor *eout);
NCTensor *nc_layer_norm(NCTensor *x, float eps);
NCTensor *nc_rms_norm(NCTensor *x, float eps);
NCTensor *nc_slt_mat_set(NCTensor *x, size_t pos, float c);
/* shift the column 'i' by 'pos + i * mult' elements and pad with zeros */
NCTensor *nc_rel_shift(NCTensor *x, ssize_t pos, ssize_t mult);
/* auto differentiation */
/* get_col_index is non NULL in the sparse gradient case */
typedef void NCParamUpdateFunc(void *opaque, NCTensor *grad,
NCTensor *get_col_index);
/* add a 'parameter' graph node to 'x' and return 'x'. */
NCTensor *nc_set_param(NCTensor *x, void *opaque);
/* return a new tensor with its graph removed */
NCTensor *nc_stop_grad(NCTensor *x);
/* manipulation of graph nodes */
NCNode *nc_dup_node(const NCNode *n);
void nc_free_node(NCNode *n);
void nc_combine_nodes(NCContext *m, NCNode **tab_op1, int count,
int axis, int elem_size, const size_t *tab_elem_size);
NCNode *nc_concat_node(NCContext *m, NCNode **inputs, int count,
int axis, const size_t *tab_size);
void nc_concat_optimization(NCContext *m, NCNode **concat_nodes, int count);
void nc_node_set_parent(NCNode *n, int arg_index, const NCNode *n1);
void nc_node_set_arg(NCNode *n, int arg_index, const NCTensor *x);
#define NC_BW_KEEP_GRAD_GRAPH (1 << 0)
/* optimize the nc_get_col() gradient */
#define NC_BW_SPARSE_GRAD (1 << 1)
void nc_backward(const NCTensor *x, NCTensor *grad,
NCParamUpdateFunc *param_update_func, int flags);
void nc_dump_graph(NCTensor *x);
/* utilities for function parameters */
typedef struct {
struct list_head link;
NCTensor **pval; /* pointer to the tensor location */
char *name; /* parameter name */
NCTensor *low_part; /* if BF16 parameter, additional 16 bit precision */
NCTensor *saved_grad; /* debug */
/* SGD opt data */
struct SGDOptVarState *sgd_opt;
} NCParam;
typedef struct {
struct list_head param_list;
BOOL add_graph;
} NCParamList;
void nc_param_list_init(NCParamList *pl);
void nc_param_list_set_graph(NCParamList *pl, BOOL add_graph);
NCParam *nc_new_param_str(NCParamList *pl, NCTensor **pval, const char *str);
__attribute__((format(printf, 3, 4))) NCParam *nc_new_param(NCParamList *pl, NCTensor **pval, const char *fmt, ...);
void nc_param_list_end(NCParamList *pl);
NCParam *nc_find_param(NCParamList *pl, const char *name);
size_t nc_get_param_count(NCParamList *pl);
void nc_save_coefs(NCParamList *pl, const char *filename);
void nc_load_coefs(NCParamList *pl, const char *filename);
void nc_save_state(NCParamList *pl, const char *filename);
void nc_load_state(NCParamList *pl, const char *filename);
/* SGD optimizer */
typedef enum {
SGD_OPT_BASIC,
SGD_OPT_ADAM,
SGD_OPT_TEST,
} SGDOptAlgoEnum;
typedef struct {
SGDOptAlgoEnum algo;
union {
struct {
float beta1;
float beta2;
float eps;
float gradient_clip; /* if != 0, per parameter gradient clipping */
} adam;
} u;
float lr;
} SGDOptParams;
NCSGDOptState *nc_sgd_opt_init(NCContext *m, const SGDOptParams *p);
void nc_sgd_opt_end(NCSGDOptState *s);
void sgd_opt_update_var(void *opaque, NCTensor *yg, NCTensor *get_col_index);
/* set the SGD optimizer 's' to all parameters of the model */
void nc_sgd_opt_set_all(NCParamList *param_list, NCSGDOptState *s);
/* set the SGD optimizer 's' to the variable 'x'. Remove it if s = NULL */
void nc_sgd_opt_set(NCParam *x, NCSGDOptState *s);
void nc_sgd_opt_update(NCSGDOptState *s);
/* force the learning rate */
void nc_sgd_opt_set_lr(NCSGDOptState *s, float lr);
float nc_sgd_opt_get_lr(NCSGDOptState *s);
/* for SGD_OPT_TEST */
NCTensor *nc_sgd_opt_get_grad(NCParam *p);
/* misc utilities (to be removed) */
typedef struct {
uint32_t seed;
/* used by Gaussian generator */
int idx;
float y1;
} RNDState;
typedef struct {
uint16_t u16;
} nc_float16_t;
void rnd_init(RNDState *s, uint32_t seed);
uint32_t rnd_unif_u32(RNDState *s);
float rnd_unif(RNDState *s);
void rnd_unif_vec(float *tab, size_t n, float mu, float range,
RNDState *s);
void rnd_unif_mat(float *tab, size_t stride, size_t h, size_t w,
float mu, float sigma, RNDState *s);
float vec_sum_f32(const float *tab, size_t n);
typedef struct {
float val;
uint32_t idx;
} NCTopKEntry;
/* Return the k largest values among prob[0...n_symb-1] such that k is
the largest value such that k <= topk and sum(i=0 .. k - 2,
prob[tab[i]]) < topp.
It is assumed that prob[i] >= 0. The function returns (k, tab,
sum). 'sum' is the sum of the k returned values. 'tab' must be
freed with nc_free(). */
int nc_topk(NCTopKEntry **ptab, double *psum,
const float *prob, size_t n, int topk, float topp);
#endif /* LIBNC_H */

96
gpt2/list.h Normal file
View file

@ -0,0 +1,96 @@
/*
* Linux klist like system
*
* Copyright (c) 2016-2017 Fabrice Bellard
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef LIST_H
#define LIST_H
/* Doubly-linked circular intrusive list, modeled after the Linux kernel's
   <linux/list.h>. The head is a sentinel node: an empty list is a head whose
   prev/next both point back at itself. Elements embed a struct list_head and
   recover the enclosing object with list_entry(). */
struct list_head {
    struct list_head *prev;
    struct list_head *next;
};

#define LIST_HEAD_INIT(el) { &(el), &(el) }

/* return the pointer of type 'type *' containing 'el' as field 'member' */
#define list_entry(el, type, member) \
    ((type *)((uint8_t *)(el) - offsetof(type, member)))

/* Make 'head' an empty list (both links point back at the sentinel). */
static inline void init_list_head(struct list_head *head)
{
    head->next = head->prev = head;
}

/* Link 'el' in between the two adjacent nodes 'prev' and 'next'. */
static inline void __list_add(struct list_head *el,
                              struct list_head *prev, struct list_head *next)
{
    el->prev = prev;
    el->next = next;
    prev->next = el;
    next->prev = el;
}

/* Insert 'el' right after the sentinel, i.e. at the front of the list. */
static inline void list_add(struct list_head *el, struct list_head *head)
{
    __list_add(el, head, head->next);
}

/* Insert 'el' right before the sentinel, i.e. at the back of the list. */
static inline void list_add_tail(struct list_head *el, struct list_head *head)
{
    __list_add(el, head->prev, head);
}

/* Unlink 'el' from whatever list it is on; its own links are cleared so a
   stale use is more likely to fault immediately. */
static inline void list_del(struct list_head *el)
{
    el->prev->next = el->next;
    el->next->prev = el->prev;
    el->prev = NULL; /* fail safe */
    el->next = NULL; /* fail safe */
}

/* A list is empty when the sentinel points back at itself. */
static inline int list_empty(struct list_head *el)
{
    return el->next == el;
}

#define list_for_each(el, head) \
    for(el = (head)->next; el != (head); el = el->next)

#define list_for_each_safe(el, el1, head) \
    for(el = (head)->next, el1 = el->next; el != (head); \
        el = el1, el1 = el->next)

#define list_for_each_prev(el, head) \
    for(el = (head)->prev; el != (head); el = el->prev)

#define list_for_each_prev_safe(el, el1, head) \
    for(el = (head)->prev, el1 = el->prev; el != (head); \
        el = el1, el1 = el->prev)
#endif /* LIST_H */

86
gpt2/readme.txt Normal file
View file

@ -0,0 +1,86 @@
GPT-2 text completion and compression demo
==========================================
1) Usage
--------
Extract the 117M GPT-2 model to the gpt2tc directory:
tar xzf gpt2tc-117M.tar.gz
Text completion example:
./gpt2tc g "Hello, my name is"
Use more CPU cores (only faster on server CPUs):
./gpt2tc -T 4 g "Hello, my name is"
Short Text compression and decompression example:
./gpt2tc cs "Hello, how are you ?"
./gpt2tc ds "姯敳痪"
Text compression example:
./gpt2tc c in.txt out.bin
Decompression:
./gpt2tc d out.bin out.txt
2) Using larger models
----------------------
The smallest GPT-2 model (117M) is provided in a separate
archive. Larger models can be built by downloading the TensorFlow
parameters and converting them with the attached script. Example:
# download the model to models/345M
./download_model.sh 345M
# convert it to the gpt2tc format:
python3 gpt2convert.py models/345M gpt2_345M.bin
# use it
./gpt2tc -m 345M g "Hello, how are you ?"
3) Compression results
----------------------
File Model Original size Compr. size Ratio CMIX v18
#params (bytes) (bytes) (bpb) ratio (bpb)
book1 117M 768771 152283 1.58 1.82
book1 345M 768771 142183 1.48
book1 774M 768771 137562 1.43
book1 1558M 768771 134217 1.40
alice29.txt 117M 152089 23615 1.24 1.65
alice29.txt 345M 152089 20587 1.08
alice29.txt 774M 152089 19096 1.00
alice29.txt 1558M 152089 17382 0.91
enwik5 117M 100000 14875 1.19 1.60
enwik5 345M 100000 13511 1.08
enwik5 774M 100000 13240 1.06
enwik5 1558M 100000 12918 1.03
Notes:
- book1 comes from the Calgary corpus.
- alice29.txt comes from the Canterbury corpus.
- enwik5 contains the first 100000 bytes of the English
Wikipedia dump of March 3, 2006
(http://mattmahoney.net/dc/textdata.html).
- For best performance, use the UTF-8 encoding and don't mix CRLF and
LF line breaks.
- For reference, the results of CMIX
(http://www.byronknoll.com/cmix.html) are provided.
4) More information
-------------------
This demo has no external dependency. It is written in C and uses the
LibNC library for tensor manipulation. The CPU must support AVX2.
A similar program is used for http://textsynth.org/

54
justlm.hpp Normal file
View file

@ -0,0 +1,54 @@
#ifndef LLM_H
#define LLM_H
#include <cstdint>
#include <ctime>
#include <functional>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <string_view>
#include <thread>
#include <vector>
/// Minimal language-model wrapper; the backend (llama.cpp or libnc GPT-2) is
/// selected at build time and hides its data behind the opaque State pointer.
class LLM {
    // Sampling / model hyperparameters shared by all backends.
    struct {
        int32_t seed; // RNG seed
        // Default to half the hardware threads
        int32_t n_threads = static_cast<int32_t>(std::thread::hardware_concurrency()) / 2;
        union {
            int32_t n_ctx; // Context size, llama.cpp specific
            int32_t n_prompt = -1; // Prompt size, gpt2 specific
        };
        int32_t n_batch = 8; // Batch size, unused
        int32_t top_k = 40;
        float top_p = 0.5f;
        float temp = 0.72f;
    } params;
    // Backend-specific state, owned by this object (allocated in init(),
    // released in the destructor).
    struct State *state = nullptr;

    void init(const std::string& weights_path);
    static
    bool ends_with(std::string_view str, std::string_view suffix);

public:
    struct Exception : public std::runtime_error {
        using std::runtime_error::runtime_error;
    };
    struct ContextLengthException : public Exception {
        ContextLengthException() : Exception("Max. context length exceeded") {}
    };

    /// @param weights_path path to the model weights file
    /// @param seed RNG seed; 0 means "seed from the current time"
    /// @throws Exception on backend initialization failure
    LLM(const std::string& weights_path, int32_t seed = 0) {
        // Set random seed
        params.seed = seed?seed:time(NULL);
        // Initialize llm
        init(weights_path);
    }
    ~LLM();
    // Non-copyable: 'state' is an owning raw pointer; the implicitly-defaulted
    // copy operations would lead to a double delete in the destructor.
    LLM(const LLM&) = delete;
    LLM& operator=(const LLM&) = delete;

    /// Append text to the prompt; on_tick reports evaluation progress (0-100)
    /// and may return false to stop early.
    void append(std::string_view prompt, const std::function<bool (float progress)>& on_tick = nullptr);
    /// Generate text until 'end' is produced; on_tick receives each generated
    /// piece and may return false to abort. Returns the text without 'end'.
    std::string run(std::string_view end, const std::function<bool (const char *generated)>& on_tick = nullptr);
};
#endif // LLM_H

9
libjustlm_core.cpp Normal file
View file

@ -0,0 +1,9 @@
#include "justlm.hpp"
#include <string_view>
// Report whether 'str' ends with 'suffix' (an empty suffix always matches).
bool LLM::ends_with(std::string_view str, std::string_view suffix) {
    if (suffix.size() > str.size()) return false;
    return str.substr(str.size() - suffix.size()) == suffix;
}

80
libjustlm_gpt2.cpp Normal file
View file

@ -0,0 +1,80 @@
#include "justlm.hpp"
#include "gpt2/gpt2tc.h"
#include <filesystem>
#include <cstring>
// Backend-specific state for the libnc GPT-2 implementation.
// (The stray file-scope 'state' object that used to follow the definition was
// removed: it was shadowed by LLM's member pointer in every member function,
// never used, and a potential ODR clash with the llama backend.)
struct State {
    std::string prompt;     // Full accumulated prompt (tokenized lazily in run())
    std::string model_path; // Path to the weights file
    GPT2ModelEnum model;    // Model variant, deduced from the weights file size
};
void LLM::init(const std::string& weights_path) {
state->model_path = weights_path;
// Get weight file size
auto weights_size = std::filesystem::file_size(weights_path);
// Determine weight size
switch (weights_size) {
case 250700242: state->model = GPT2_MODEL_117M; break;
case 3120522738: state->model = GPT2_MODEL_1558M; break;
case 712396722: state->model = GPT2_MODEL_345M; break;
case 1551900050: state->model = GPT2_MODEL_774M; break;
default: throw Exception("Unknown model size");
}
}
// Release the backend state allocated in init().
LLM::~LLM() {
    delete state;
}
// Append text to the prompt. The GPT-2 backend tokenizes lazily in run(), so
// this is pure string concatenation and 'on_tick' is never invoked here.
void LLM::append(std::string_view prompt, const std::function<bool (float)> &on_tick) {
    state->prompt.append(prompt);
    // NOTE(review): echoes the appended text to stdout — presumably leftover
    // debug output; confirm before shipping as a library
    std::cout << prompt << std::endl;
}
/// Generate text until the marker 'end' is produced, the model stops, or the
/// callback aborts. Returns the generated text without the end marker.
std::string LLM::run(std::string_view end, const std::function<bool (const char *)> &on_tick) {
    std::string fres;

    // Initialize completion (unused 'count'/'timeval' locals removed)
    TextCompleteGlobalState *tcs = text_complete_global_init(state->model, state->model_path.c_str());
    // Start completion
    // NOTE(review): ?: binds looser than '-', so the prompt size is only
    // subtracted from the 0xfffffff fallback, never from n_prompt — confirm
    // whether that is the intended length limit
    TextGenContext *ts = text_complete_start(tcs, state->prompt.c_str(), params.top_k, params.top_p, params.temp,
                             params.seed, params.n_prompt>0?params.n_prompt:0xfffffff - state->prompt.size());
    struct list_head ts_list;

    // Run completion token by token
    bool abort = false;
    while (!abort && !ends_with(fres, end)) {
        init_list_head(&ts_list);
        list_add_tail(&ts->link, &ts_list);
        text_complete_next(tcs, &ts_list);
        if (ts->out_text_len == 0)
            break; // Model produced no further output
        auto str = std::string_view{ts->out_text, static_cast<std::string_view::size_type>(ts->out_text_len)};
        // Append result to fres
        fres.append(str);
        // Tick
        if (on_tick && !on_tick(std::string(str).c_str()) /*Huge overhead in favor of llama.cpp*/) abort = true;
    }

    // End completion
    text_complete_end(ts);
    text_complete_global_end(tcs);

    // Remember the generated text as part of the prompt
    state->prompt.append(fres);
    // Strip the end marker — but only when it is actually there. The loop can
    // exit via abort or empty output, in which case fres need not end with
    // 'end' and may even be shorter, making the old unconditional
    // 'fres.size()-end.size()' underflow.
    if (ends_with(fres, end))
        fres.resize(fres.size() - end.size());
    // Return final string
    return fres;
}

115
libjustlm_llama.cpp Normal file
View file

@ -0,0 +1,115 @@
#include "justlm.hpp"
#include <ggml.h>
#include <llama.h>
// Backend-specific state for the llama.cpp implementation.
// (The stray file-scope 'state' object that used to follow the definition was
// removed: it was shadowed by LLM's member pointer in every member function
// and never used.)
struct State {
    llama_context *ctx = nullptr; // llama.cpp context handle, freed in ~LLM()
    std::string prompt;           // Full accumulated prompt text
    std::vector<int> embd;        // Token ids evaluated so far
    int n_ctx;                    // Model context window size (set in init())
    // NOTE(review): never written or read in this file — confirm it is needed
    std::string last_result;
};
// Allocate the backend state and create a llama.cpp context from the weights
// file. Throws Exception if the model cannot be loaded.
void LLM::init(const std::string& weights_path) {
    // Allocate state
    state = new State;
    // Get llama parameters
    auto lparams = llama_context_default_params();
    lparams.seed = params.seed;
    // Fall back to a fixed context size when the caller did not request one
    // NOTE(review): '2024' looks like a typo for the conventional 2048 — confirm
    lparams.n_ctx = params.n_ctx>0?params.n_ctx:2024;
    // Create context
    state->ctx = llama_init_from_file(weights_path.c_str(), lparams);
    if (!state->ctx) {
        throw Exception("Failed to initialize llama from file");
    }
    // Cache the actual context size reported by the loaded model
    state->n_ctx = llama_n_ctx(state->ctx);
}
// Free the llama context (if init() got far enough to create it), then the
// backend state itself.
LLM::~LLM() {
    if (state->ctx) llama_free(state->ctx);
    delete state;
}
/// Append text to the prompt, tokenize only the new chunk, and evaluate the
/// new tokens one at a time. 'on_tick' receives progress in percent and may
/// return false to stop evaluating early.
/// @throws ContextLengthException when the context window is nearly full
void LLM::append(std::string_view prompt, const std::function<bool (float)> &on_tick) {
    // An empty prompt so far means the tokenizer must insert the BOS token
    const bool was_empty = state->prompt.empty();
    // Append to current prompt
    state->prompt.append(prompt);
    // Resize buffer for tokens: only the new chunk gets tokenized, and a token
    // never encodes less than one byte, so chunk size + 1 slots suffice (the
    // original reserved space for the entire accumulated prompt).
    const auto old_token_count = state->embd.size();
    state->embd.resize(old_token_count+prompt.size()+1);
    // llama_tokenize() expects a NUL-terminated C string, which a string_view
    // does not guarantee — tokenize the NUL-terminated tail of the stored
    // prompt (it is exactly the chunk we just appended).
    const char *const new_text = state->prompt.c_str() + (state->prompt.size() - prompt.size());
    const auto token_count = llama_tokenize(state->ctx, new_text, state->embd.data()+old_token_count, state->embd.size()-old_token_count, was_empty);
    state->embd.resize(old_token_count+token_count);
    // Make sure limit is far from being hit (keep a 6-token safety margin)
    if (state->embd.size() > size_t(state->n_ctx)-6) {
        // Yup. *this MUST be decomposed now.
        throw ContextLengthException();
    }
    // Evaluate new tokens
    // TODO: Larger batch size
    std::cout << "Context size: " << old_token_count << '+' << token_count << '=' << state->embd.size() << '/' << state->n_ctx << std::endl;
    for (size_t it = old_token_count; it != state->embd.size(); it++) {
        std::cout << llama_token_to_str(state->ctx, state->embd.data()[it]) << std::flush;
        llama_eval(state->ctx, state->embd.data()+it, 1, it, params.n_threads);
        // Tick
        if (on_tick) {
            // Calculate progress in percent
            auto progress = float(it-old_token_count) / (state->embd.size()-old_token_count) * 100.f;
            // Run callback
            if (!on_tick(progress)) break;
        }
    }
    std::cout << std::endl;
}
/// Generate text until the marker 'end' is produced or the callback aborts.
/// Returns the generated text without the end marker.
std::string LLM::run(std::string_view end, const std::function<bool (const char *)> &on_tick) {
    std::string fres;

    // Loop until done
    bool abort = false;
    while (!abort && !ends_with(fres, end)) {
        // Sample top p and top k
        const auto id = llama_sample_top_p_top_k(state->ctx, nullptr, 0, params.top_k, params.top_p, params.temp, 1.0f);
        // Add token
        state->embd.push_back(id);
        // Get token as string
        const auto str = llama_token_to_str(state->ctx, id);
        // Debug
        std::cout << str << std::flush;
        // Append string to function result
        fres.append(str);
        // Evaluate token
        // TODO: Respect batch size
        llama_eval(state->ctx, state->embd.data()+state->embd.size()-1, 1, state->embd.size()-1, params.n_threads);
        // Tick
        if (on_tick && !on_tick(str)) abort = true;
    }

    // Remember the generated text as part of the prompt
    state->prompt.append(fres);
    // Strip the end marker — but only when it is actually there. An aborted
    // run need not end with 'end' and fres may even be shorter than it,
    // making the old unconditional 'fres.size()-end.size()' underflow.
    if (ends_with(fres, end))
        fres.resize(fres.size() - end.size());
    // Return final string
    return fres;
}

1
llama.cpp Submodule

@ -0,0 +1 @@
Subproject commit 9cbc404ba6699a9ba4925ea25a60552b13491c7a

12
test.cpp Normal file
View file

@ -0,0 +1,12 @@
#include "ai.hpp"
#include <iostream>
// Smoke test: complete a fixed prompt and print the result.
// NOTE(review): this targets an 'Ai' wrapper from "ai.hpp", which is not part
// of this repository view — confirm that header exists or port the test to
// the LLM class from justlm.hpp.
int main() {
    Ai ai;
    std::cout << "Completing \"she replied that\"..." << std::endl;
    std::cout << "Using model " << ai.model_name << "..." << std::endl;
    std::cout << "> she replied that" << ai.complete("she replied that", '\n') << std::endl;
}