commit aaddcc0cbd10c9e7de46c6ad1ad6c4a20a9eaf17 Author: niansa Date: Thu Mar 30 07:03:33 2023 -0500 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4a0b530 --- /dev/null +++ b/.gitignore @@ -0,0 +1,74 @@ +# This file is used to ignore files which are generated +# ---------------------------------------------------------------------------- + +*~ +*.autosave +*.a +*.core +*.moc +*.o +*.obj +*.orig +*.rej +*.so +*.so.* +*_pch.h.cpp +*_resource.rc +*.qm +.#* +*.*# +core +!core/ +tags +.DS_Store +.directory +*.debug +Makefile* +*.prl +*.app +moc_*.cpp +ui_*.h +qrc_*.cpp +Thumbs.db +*.res +*.rc +/.qmake.cache +/.qmake.stash + +# qtcreator generated files +*.pro.user* +CMakeLists.txt.user* + +# xemacs temporary files +*.flc + +# Vim temporary files +.*.swp + +# Visual Studio generated files +*.ib_pdb_index +*.idb +*.ilk +*.pdb +*.sln +*.suo +*.vcproj +*vcproj.*.*.user +*.ncb +*.sdf +*.opensdf +*.vcxproj +*vcxproj.* + +# MinGW generated files +*.Debug +*.Release + +# Python byte code +*.pyc + +# Binaries +# -------- +*.dll +*.exe + diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..0477fdd --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "llama.cpp"] + path = llama.cpp + url = https://github.com/ggerganov/llama.cpp.git diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..858d069 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,24 @@ +cmake_minimum_required(VERSION 3.14) + +project(libjustlm LANGUAGES C CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +add_library(libjustlm STATIC + libjustlm_core.cpp + justlm.hpp +) + +set(LM_BACKEND "llama.cpp" CACHE STRING "The language model backend to use") + +if (LM_BACKEND STREQUAL "libnc gpt2") + add_library(libjustlm_gpt2 STATIC libjustlm_gpt2.cpp gpt2/arith.c gpt2/cp_utils.c gpt2/gpt2tc.c) + target_link_libraries(libjustlm_gpt2 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gpt2/libnc.so pthread) +elseif (LM_BACKEND 
STREQUAL "llama.cpp") + add_subdirectory(llama.cpp) + add_library(libjustlm_llama STATIC libjustlm_llama.cpp) + target_link_libraries(libjustlm_llama PRIVATE llama) +else() + message(FATAL_ERROR "LM_BACKEND '${LM_BACKEND}' is unsupported. Please use either 'libnc gpt2' or 'llama.cpp'.") +endif() diff --git a/gpt2/VERSION b/gpt2/VERSION new file mode 100644 index 0000000..da4ce28 --- /dev/null +++ b/gpt2/VERSION @@ -0,0 +1 @@ +2021-04-24 diff --git a/gpt2/arith.c b/gpt2/arith.c new file mode 100644 index 0000000..79d0148 --- /dev/null +++ b/gpt2/arith.c @@ -0,0 +1,301 @@ +/* + * Arithmetic coder + * + * Copyright (c) 2018-2021 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cutils.h" +#include "arith.h" + +#define RANGE_MIN_BITS 16 +#define RANGE_MIN ((0xff << (RANGE_MIN_BITS - 8)) + 1) +#define RANGE_MAX (0xff << RANGE_MIN_BITS) + +//#define DUMP_PUT_BIT +//#define DUMP_GET_BIT + +void put_bit_init(PutBitState *s, uint8_t *buf, int buf_size, + PutBitWriteFunc *write_func, void *opaque) +{ + s->low = 0; + s->range = RANGE_MAX; + s->current_byte = 0xff; + s->n_bytes = 0; + s->buf = buf; + s->buf_size = buf_size; + s->idx = 0; + s->write_func = write_func; + s->opaque = opaque; + s->byte_count = 0; + assert(PROB_UNIT <= RANGE_MIN); +} + +static void put_byte(PutBitState *s, int v) +{ + s->buf[s->idx++] = v; + if (unlikely(s->idx == s->buf_size)) { + s->byte_count += s->idx; + s->write_func(s->opaque, s->buf, s->idx); + s->idx = 0; + } +} + +/* 0 <= v <= 0x1fe. The current output stream contains n_bytes with: + current_byte, then (n_bytes - 1) x 0xff + */ +static void put_val(PutBitState *s, int v) +{ + uint32_t carry, b; + +#ifdef DUMP_PUT_BIT + printf(" out=%d\n", v); +#endif + if (v == 0xff) { + s->n_bytes++; + } else { + if (s->n_bytes > 0) { + carry = v >> 8; + put_byte(s, s->current_byte + carry); + b = (0xff + carry) & 0xff; + while (s->n_bytes > 1) { + put_byte(s, b); + s->n_bytes--; + } + } + s->n_bytes = 1; + s->current_byte = v; + } +} + +static void put_val_flush(PutBitState *s) +{ + if (s->n_bytes > 0) { + put_val(s, 0); + } +} + +static void put_bit_renorm(PutBitState *s) +{ + uint32_t v; + /* after renormalisation: + 0 <= low <= RANGE_MAX + RANGE_MIN <= range <= RANGE_MAX + In the worst case before normalisation: + low_max = 2 * RANGE_MAX hence v <= 0x1fe + */ + while (s->range < RANGE_MIN) { + v = s->low >> RANGE_MIN_BITS; + put_val(s, v); + s->low = (s->low & ((1 << RANGE_MIN_BITS) - 1)) << 8; + s->range <<= 8; + } +} + +/* 0 < prob0 < PROB_UNIT */ +void put_bit(PutBitState *s, int prob0, int bit) +{ + int range0; + + 
assert(s->range >= RANGE_MIN); + range0 = ((uint64_t)s->range * prob0) >> PROB_UNIT_BITS; + assert(range0 > 0); + assert(range0 < s->range); +#if defined(DUMP_PUT_BIT) + { + static int count; + printf("%d: range=%d b=%d range0=%d low=%d\n", + count++, s->range, bit, range0, s->low); + } +#endif + if (!bit) { + s->range = range0; + } else { + s->low += range0; + s->range -= range0; + } + + put_bit_renorm(s); +} + +void put_bit_raw(PutBitState *s, int bit) +{ + int range0; + + assert(s->range >= RANGE_MIN); + range0 = s->range >> 1; + if (!bit) { + s->range = range0; + } else { + s->low += range0; + s->range -= range0; + } + + put_bit_renorm(s); +} + +/* return the minimum number of bits to be able to correctly decode */ +int64_t put_bit_flush(PutBitState *s) +{ + int n, val, mask; + + /* force larger range */ + if (s->range < (1 << RANGE_MIN_BITS)) { + put_val(s, s->low >> RANGE_MIN_BITS); + s->low = (s->low & ((1 << RANGE_MIN_BITS) - 1)) << 8; + s->range <<= 8; + } + + /* largest n such as 2^n <= range */ + n = 0; + while ((1 << (n + 1)) <= s->range) + n++; + assert(n >= RANGE_MIN_BITS && n <= (RANGE_MIN_BITS + 7)); + + val = s->low; + mask = (1 << n) - 1; + if ((val & mask) != 0) + val = (val + (1 << n)) & ~mask; + assert(val >= s->low && val < s->low + s->range); + + put_val(s, val >> RANGE_MIN_BITS); + put_val_flush(s); + if (s->idx > 0) { + s->byte_count += s->idx; + s->write_func(s->opaque, s->buf, s->idx); + s->idx = 0; + } + return (s->byte_count - 1) * 8 + (RANGE_MIN_BITS + 8 - n); +} + +/* return the approximate number of written bits */ +int64_t put_bit_get_bit_count(PutBitState *s) +{ + int n; + n = 0; + while ((1 << (n + 1)) <= s->range) + n++; + return (s->byte_count + s->idx) * 8 + (RANGE_MIN_BITS + 7 - n); +} + +/****************************************/ + +static void refill(GetBitState *s) +{ + s->range <<= 8; + s->low <<= 8; + if (s->idx >= s->buf_len) { + if (!s->read_func) + return; /* pad with zeros */ + s->buf_len = s->read_func(s->opaque, 
s->buf, s->buf_size); + s->byte_count += s->buf_len; + s->idx = 0; + } +#ifdef DUMP_GET_BIT + printf(" in=%d\n", s->buf[s->idx]); +#endif + s->low += s->buf[s->idx++]; +} + +void get_bit_init(GetBitState *s, uint8_t *buf, size_t buf_size, + GetBitReadFunc *read_func, void *opaque) +{ + int i; + s->buf_size = buf_size; + s->buf = buf; + s->read_func = read_func; + s->opaque = opaque; + if (read_func) { + s->buf_len = 0; + } else { + /* prefilled buffer */ + s->buf_len = s->buf_size; + } + s->byte_count = s->buf_len; + s->range = 0; + s->low = 0; + s->idx = 0; + for(i = 0; i <= RANGE_MIN_BITS; i += 8) { + refill(s); + } + s->range = RANGE_MAX; +} + +/* 0 < prob0 < PROB_UNIT */ +int get_bit(GetBitState *s, int prob0) +{ + int b, range0; + + assert(s->range >= RANGE_MIN); + range0 = ((uint64_t)s->range * prob0) >> PROB_UNIT_BITS; + assert(range0 > 0); + assert(range0 < s->range); + b = s->low >= range0; +#ifdef DUMP_GET_BIT + { + static int count; + printf("%d: range=%d b=%d range0=%d low=%d\n", count++, s->range, b, range0, s->low); + } +#endif + if (b) { + s->low -= range0; + s->range -= range0; + } else { + s->range = range0; + } + while (s->range < RANGE_MIN) + refill(s); + return b; +} + +/* no context */ +int get_bit_raw(GetBitState *s) +{ + int b, range0; + range0 = s->range >> 1; + b = s->low >= range0; + if (b) { + s->low -= range0; + s->range -= range0; + } else { + s->range = range0; + } + if (s->range < RANGE_MIN) + refill(s); + return b; +} + +/* return the approximate number of read bits */ +int64_t get_bit_get_bit_count(GetBitState *s) +{ + int n; + n = 0; + while ((1 << (n + 1)) <= s->range) + n++; + return (s->byte_count - s->buf_len + s->idx) * 8 - n; +} diff --git a/gpt2/arith.h b/gpt2/arith.h new file mode 100644 index 0000000..d1a4e31 --- /dev/null +++ b/gpt2/arith.h @@ -0,0 +1,73 @@ +/* + * Arithmetic coder + * + * Copyright (c) 2018-2019 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of 
this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef ARITH_H +#define ARITH_H + +#define PROB_UNIT_BITS 15 +#define PROB_UNIT (1 << PROB_UNIT_BITS) + +typedef void PutBitWriteFunc(void *opaque, const uint8_t *buf, size_t buf_size); + +typedef struct { + uint32_t range; + uint32_t low; + uint8_t current_byte; + uint32_t n_bytes; + uint8_t *buf; + size_t buf_size; + size_t idx; /* current position in bytes */ + PutBitWriteFunc *write_func; + void *opaque; + uint64_t byte_count; +} PutBitState; + +void put_bit_init(PutBitState *s, uint8_t *buf, int buf_size, + PutBitWriteFunc *write_func, void *opaque); +void put_bit(PutBitState *s, int prob0, int bit); +void put_bit_raw(PutBitState *s, int bit); +int64_t put_bit_flush(PutBitState *s); +int64_t put_bit_get_bit_count(PutBitState *s); + +/* return the number of read bytes */ +typedef ssize_t GetBitReadFunc(void *opaque, uint8_t *buf, size_t buf_size); + +typedef struct { + uint8_t *buf; + int buf_len; + int buf_size; + int idx; + uint32_t low; + uint32_t range; + 
GetBitReadFunc *read_func; + void *opaque; + uint64_t byte_count; +} GetBitState; + +void get_bit_init(GetBitState *s, uint8_t *buf, size_t buf_size, + GetBitReadFunc *read_func, void *opaque); +int get_bit(GetBitState *s, int prob0); +int get_bit_raw(GetBitState *s); +int64_t get_bit_get_bit_count(GetBitState *s); + +#endif /* ARITH_H */ diff --git a/gpt2/cp_utils.c b/gpt2/cp_utils.c new file mode 100644 index 0000000..d049d92 --- /dev/null +++ b/gpt2/cp_utils.c @@ -0,0 +1,316 @@ +/* + * Compression utilities + * + * Copyright (c) 2018-2019 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _WIN32 +#include +#endif + +#include "cutils.h" +#include "libnc.h" +#include "cp_utils.h" + +void fatal_error(const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + fprintf(stderr, "Fatal error: "); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + exit(1); +} + +int64_t get_time_ms(void) +{ +#ifdef _WIN32 + struct timeval tv; + gettimeofday(&tv, NULL); + return (int64_t)tv.tv_sec * 1000 + (tv.tv_usec / 1000U); +#else + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (int64_t)ts.tv_sec * 1000 + (ts.tv_nsec / 1000000U); +#endif +} + +void fput_u8(FILE *f, uint8_t v) +{ + fputc(v, f); +} + +int fget_u8(FILE *f, uint8_t *pv) +{ + int c; + c = fgetc(f); + if (c < 0) + return -1; + *pv = c; + return 0; +} + +void fput_be16(FILE *f, uint16_t v) +{ + fputc(v >> 8, f); + fputc(v >> 0, f); +} + +int fget_be16(FILE *f, uint16_t *pv) +{ + uint8_t buf[2]; + if (fread(buf, 1, sizeof(buf), f) != sizeof(buf)) + return -1; + *pv = (buf[0] << 8) | + (buf[1] << 0); + return 0; +} + +void fput_be32(FILE *f, uint32_t v) +{ + fputc(v >> 24, f); + fputc(v >> 16, f); + fputc(v >> 8, f); + fputc(v >> 0, f); +} + +int fget_be32(FILE *f, uint32_t *pv) +{ + uint8_t buf[4]; + if (fread(buf, 1, sizeof(buf), f) != sizeof(buf)) + return -1; + *pv = (buf[0] << 24) | + (buf[1] << 16) | + (buf[2] << 8) | + (buf[3] << 0); + return 0; +} + +void fput_sgd_opt(FILE *f, const SGDOptParams *p) +{ + fput_u8(f, p->algo); + switch(p->algo) { + case SGD_OPT_BASIC: + break; + case SGD_OPT_ADAM: + fput_f32(f, p->u.adam.beta1); + fput_f32(f, p->u.adam.beta2); + fput_f32(f, p->u.adam.eps); + fput_f32(f, p->u.adam.gradient_clip); + break; + default: + abort(); + } +} + +int fget_sgd_opt(FILE *f, SGDOptParams *p) +{ + uint8_t v8; + + if (fget_u8(f, &v8)) + return -1; + p->algo = v8; + switch(p->algo) { + case SGD_OPT_BASIC: + break; + case SGD_OPT_ADAM: + if (fget_f32(f, &p->u.adam.beta1)) + return -1; + if (fget_f32(f, &p->u.adam.beta2)) + return -1; + if (fget_f32(f, &p->u.adam.eps)) + return -1; + if (fget_f32(f, &p->u.adam.gradient_clip)) + return -1; + break; + default: + return -1; + } + return 0; +} + 
+void dump_sgd_opt_params(FILE *f, const SGDOptParams *p) +{ + switch(p->algo) { + case SGD_OPT_BASIC: + fprintf(f, " sgd_opt=%s", + "none"); + break; + case SGD_OPT_ADAM: + fprintf(f, " sgd_opt=%s beta1=%g beta2=%g eps=%g gclip=%g", + "adam", + p->u.adam.beta1, + p->u.adam.beta2, + p->u.adam.eps, + p->u.adam.gradient_clip); + break; + default: + abort(); + } +} + +typedef union { + float f; + uint32_t u32; +} f32; + +void fput_f32(FILE *f, float v) +{ + f32 u; + u.f = v; + fput_be32(f, u.u32); +} + +int fget_f32(FILE *f, float *pv) +{ + f32 u; + if (fget_be32(f, &u.u32)) + return -1; + *pv = u.f; + return 0; +} + +void write_sym(PutBitState *pb, const float *prob_table, int n_symb, int sym) +{ + int start, range, prob0, bit, range0; + float p, p0; + + start = 0; + range = n_symb; + p = 1.0; /* invariant: p=sum(prob_table[start...start + range]) */ + while (range > 1) { + range0 = range >> 1; + p0 = vec_sum_f32(prob_table + start, range0); + prob0 = lrintf(p0 * PROB_UNIT / p); + prob0 = clamp_int(prob0, 1, PROB_UNIT - 1); + bit = sym >= (start + range0); + put_bit(pb, prob0, bit); + if (bit) { + start += range0; + range = range - range0; + p = p - p0; + } else { + p = p0; + range = range0; + } + } +} + +int read_sym(GetBitState *gb, const float *prob_table, int n_symb) +{ + int start, range, prob0, bit, range0; + float p, p0; + + start = 0; + range = n_symb; + p = 1.0; /* invariant: p=sum(prob_table[start...start + range]) */ + while (range > 1) { + range0 = range >> 1; + p0 = vec_sum_f32(prob_table + start, range0); + prob0 = lrintf(p0 * PROB_UNIT / p); + prob0 = clamp_int(prob0, 1, PROB_UNIT - 1); + bit = get_bit(gb, prob0); + if (bit) { + start += range0; + range = range - range0; + p = p - p0; + } else { + p = p0; + range = range0; + } + } + return start; +} + +void create_debug_dir(char *debug_dir, size_t debug_dir_size, + const char *debug_path, const char *prefix) +{ + char name1[1024]; + struct tm *tm; + time_t ti; + + snprintf(name1, sizeof(name1), 
"%s/%s", debug_path, prefix); +#ifdef _WIN32 + _mkdir(name1); +#else + mkdir(name1, 0777); +#endif + + ti = time(NULL); + tm = localtime(&ti); + snprintf(debug_dir, debug_dir_size, "%s/%04u%02u%02u-%02u%02u%02u", + name1, + tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, + tm->tm_hour, tm->tm_min, tm->tm_sec); +#ifdef _WIN32 + _mkdir(debug_dir); +#else + mkdir(debug_dir, 0777); +#endif +} + +/* we print at least 3 significant digits with at most 5 chars, except + if larger than 9999T. The value is rounded to zero. */ +char *get_si_prefix(char *buf, int buf_size, uint64_t val) +{ + static const char suffixes[4] = "kMGT"; + uint64_t base; + int i; + + if (val <= 999) { + snprintf(buf, buf_size, "%" PRId64, val); + } else { + base = 1000; + for(i=0;i<4;i++) { + /* Note: we round to 0 */ + if (val < base * 10) { + snprintf(buf, buf_size, "%0.2f%c", + floor((val * 100.0) / base) / 100.0, + suffixes[i]); + break; + } else if (val < base * 100) { + snprintf(buf, buf_size, "%0.1f%c", + floor((val * 10.0) / base) / 10.0, + suffixes[i]); + break; + } else if (val < base * 1000 || (i == 3)) { + snprintf(buf, buf_size, + "%" PRId64 "%c", + val / base, + suffixes[i]); + break; + } + base = base * 1000; + } + } + return buf; +} diff --git a/gpt2/cp_utils.h b/gpt2/cp_utils.h new file mode 100644 index 0000000..74deaa0 --- /dev/null +++ b/gpt2/cp_utils.h @@ -0,0 +1,48 @@ +/* + * Compression utilities + * + * Copyright (c) 2018-2019 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included 
in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "arith.h" +#include "libnc.h" + +void __attribute__((noreturn, format(printf, 1, 2))) fatal_error(const char *fmt, ...); + +int64_t get_time_ms(void); +void fput_u8(FILE *f, uint8_t v); +int fget_u8(FILE *f, uint8_t *pv); +void fput_be16(FILE *f, uint16_t v); +int fget_be16(FILE *f, uint16_t *pv); +void fput_be32(FILE *f, uint32_t v); +int fget_be32(FILE *f, uint32_t *pv); +void fput_f32(FILE *f, float v); +int fget_f32(FILE *f, float *pv); +void fput_sgd_opt(FILE *f, const SGDOptParams *p); +int fget_sgd_opt(FILE *f, SGDOptParams *p); +void dump_sgd_opt_params(FILE *f, const SGDOptParams *p); + +void write_sym(PutBitState *pb, const float *prob_table, int n_symb, int sym); +int read_sym(GetBitState *gb, const float *prob_table, int n_symb); + +void create_debug_dir(char *debug_dir, size_t debug_dir_size, + const char *debug_path, const char *prefix); +char *get_si_prefix(char *buf, int buf_size, uint64_t val); + diff --git a/gpt2/cutils.h b/gpt2/cutils.h new file mode 100644 index 0000000..68c1df2 --- /dev/null +++ b/gpt2/cutils.h @@ -0,0 +1,152 @@ +#ifndef CUTILS_H +#define CUTILS_H + +#include + +#define force_inline inline __attribute__((always_inline)) +#define no_inline __attribute__((noinline)) +#define __unused __attribute__((unused)) +#define xglue(x, y) x ## y +#define glue(x, y) xglue(x, y) +#ifndef offsetof +#define offsetof(type, field) ((size_t) &((type *)0)->field) +#endif +#define countof(x) 
(sizeof(x) / sizeof(x[0])) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +typedef int BOOL; + +#ifndef FALSE +enum { + FALSE = 0, + TRUE = 1, +}; +#endif + +typedef struct { + uint16_t u16; +} bfloat16_t; + +#if defined(__x86_64__) +static inline int64_t get_cycles(void) +{ + uint32_t low,high; + int64_t val; + asm volatile("rdtsc" : "=a" (low), "=d" (high)); + val = high; + val <<= 32; + val |= low; + return val; +} +#else +static inline int64_t get_cycles(void) +{ + int64_t val; + asm volatile ("rdtsc" : "=A" (val)); + return val; +} +#endif + +static inline int max_int(int a, int b) +{ + if (a > b) + return a; + else + return b; +} + +static inline int min_int(int a, int b) +{ + if (a < b) + return a; + else + return b; +} + +static inline size_t max_size_t(size_t a, size_t b) +{ + if (a > b) + return a; + else + return b; +} + +static inline size_t min_size_t(size_t a, size_t b) +{ + if (a < b) + return a; + else + return b; +} + +static inline ssize_t max_ssize_t(ssize_t a, ssize_t b) +{ + if (a > b) + return a; + else + return b; +} + +static inline ssize_t min_ssize_t(ssize_t a, ssize_t b) +{ + if (a < b) + return a; + else + return b; +} + +static inline int clamp_int(int val, int min_val, int max_val) +{ + if (val < min_val) + return min_val; + else if (val > max_val) + return max_val; + else + return val; +} + +static inline float clamp_float(float val, float min_val, float max_val) +{ + if (val < min_val) + return min_val; + else if (val > max_val) + return max_val; + else + return val; +} + +/* WARNING: undefined if a = 0 */ +static inline int clz32(unsigned int a) +{ + return __builtin_clz(a); +} + +/* WARNING: undefined if a = 0 */ +static inline int clz64(uint64_t a) +{ + return __builtin_clzll(a); +} + +static inline int floor_log2(uint64_t a) +{ + return 63 - clz64(a); +} + +static inline int ceil_log2(uint64_t a) +{ + if (a <= 1) + return 0; + else + return 64 - clz64(a - 1); +} + +static inline 
float squaref(float x) +{ + return x * x; +} + +#define DUP8(a) a, a, a, a, a, a, a, a + +#endif /* CUTILS_H */ + diff --git a/gpt2/gpt2tc.c b/gpt2/gpt2tc.c new file mode 100644 index 0000000..3ffd4ae --- /dev/null +++ b/gpt2/gpt2tc.c @@ -0,0 +1,2023 @@ +/* + * Text Completion with GPT-2 Transformer + * + * Copyright (c) 2019-2021 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cutils.h" +#include "arith.h" +#include "libnc.h" +#include "cp_utils.h" +#include "list.h" +#include "gpt2tc.h" + + +/************************************************/ +/* Transformer model */ + +static int nb_threads = 1; + +/* [seg_len, d_model] -> + [n_head, seg_len, d_model/n_head] */ +static NCTensor *split_head(NCTensor *x, int n_head) +{ + const size_t *dims; + int n_dims, axis[3]; + + dims = nc_tensor_get_dims(x, &n_dims); + assert(n_dims == 2); + assert((dims[0] % n_head) == 0); + x = nc_reshape_3d(x, dims[0] / n_head, n_head, dims[1]); + /* [seg_len, n_head, d_model/n_head] */ + axis[0] = 0; + axis[1] = 2; + axis[2] = 1; + return nc_permute(x, 3, axis); +} + +/* [n_head, seg_len, d_value] + -> [seg_len, d_value * n_head] */ +static NCTensor *concat_head(NCTensor *x) +{ + const size_t *dims; + int n_dims, axis[3]; + + axis[0] = 0; + axis[1] = 2; + axis[2] = 1; + x = nc_permute(x, 3, axis); + dims = nc_tensor_get_dims(x, &n_dims); + assert(n_dims == 3); + /* [seg_len, n_head, d_value] */ + return nc_reshape_2d(x, dims[0] * dims[1], dims[2]); +} + +#define MAT_STRIDE 64 + +/* convert the matrix to strided representation */ +static void convert_mat(NCTensor **pw) +{ + NCTensor *w; + int m, n, n_dims, r; + const size_t *dims; + int axis[3]; + + w = *pw; + dims = nc_tensor_get_dims(w, &n_dims); + assert(n_dims == 2); + m = dims[0]; + n = dims[1]; + r = (-m) % MAT_STRIDE; + if (r < 0) + r += MAT_STRIDE; + w = nc_pad(w, 0, NC_PAD_ZERO, r, NC_PAD_ZERO); + w = nc_reshape_3d(w, MAT_STRIDE, (m + MAT_STRIDE - 1) / MAT_STRIDE, n); + axis[0] = 0; + axis[1] = 2; + axis[2] = 1; + w = nc_permute(w, 3, axis); + *pw = w; +} + +static TransformerModel *trf_init(const TransformerModelParams *p, + const char *coefs_filename) +{ + TransformerModel *s; + NCContext *m; + NCDevice *d; + int layer_idx; + TransformerLayer 
*layers, *tl; + + s = nc_mallocz(sizeof(*s)); + rnd_init(&s->rnd_state, p->seed); + s->n_layer = p->n_layer; + s->d_model = p->d_model; + s->n_head = p->n_head; + s->d_key = p->d_key; + s->d_value = p->d_value; + s->d_inner = p->d_inner; + s->n_ctx = p->n_ctx; + s->n_symbols = p->n_symbols; + + m = nc_context_init(nb_threads); + s->model = m; + d = nc_new_cpu_device(m); + s->device = d; + + nc_param_list_init(&s->param_list); + /* disable graph for the parameters */ + nc_param_list_set_graph(&s->param_list, FALSE); + + layers = nc_mallocz(sizeof(layers[0]) * s->n_layer); + s->layers = layers; + for(layer_idx = 0; layer_idx < s->n_layer; layer_idx++) { + tl = &layers[layer_idx]; + tl->ln_1_g = nc_new_tensor_1d(d, NC_TYPE_F32, s->d_model); + nc_new_param(&s->param_list, &tl->ln_1_g, "h%d/ln_1/g", layer_idx); + + tl->ln_1_b = nc_new_tensor_1d(d, NC_TYPE_F32, s->d_model); + nc_new_param(&s->param_list, &tl->ln_1_b, "h%d/ln_1/b", layer_idx); + + tl->attn_w = nc_new_tensor_2d(d, NC_TYPE_F16, s->n_head * s->d_key * 3, + s->d_model); + nc_new_param(&s->param_list, &tl->attn_w, + "h%d/attn/c_attn/w", layer_idx); + + tl->attn_b = nc_new_tensor_1d(d, NC_TYPE_F32, s->n_head * s->d_key * 3); + nc_new_param(&s->param_list, &tl->attn_b, + "h%d/attn/c_attn/b", layer_idx); + + tl->attn_proj_w = nc_new_tensor_2d(d, NC_TYPE_F16, s->d_model, + s->n_head * s->d_value); + nc_new_param(&s->param_list, &tl->attn_proj_w, + "h%d/attn/c_proj/w", layer_idx); + + tl->attn_proj_b = nc_new_tensor_1d(d, NC_TYPE_F32, s->d_model); + nc_new_param(&s->param_list, &tl->attn_proj_b, + "h%d/attn/c_proj/b", layer_idx); + + tl->ln_2_g = nc_new_tensor_1d(d, NC_TYPE_F32, s->d_model); + nc_new_param(&s->param_list, &tl->ln_2_g, "h%d/ln_2/g", layer_idx); + + tl->ln_2_b = nc_new_tensor_1d(d, NC_TYPE_F32, s->d_model); + nc_new_param(&s->param_list, &tl->ln_2_b, "h%d/ln_2/b", layer_idx); + + tl->mlp_fc_w = nc_new_tensor_2d(d, NC_TYPE_F16, s->d_inner, + s->d_model); + nc_new_param(&s->param_list, &tl->mlp_fc_w, + 
"h%d/mlp/c_fc/w", layer_idx); + + tl->mlp_fc_b = nc_new_tensor_1d(d, NC_TYPE_F32, s->d_inner); + nc_new_param(&s->param_list, &tl->mlp_fc_b, + "h%d/mlp/c_fc/b", layer_idx); + + tl->mlp_proj_w = nc_new_tensor_2d(d, NC_TYPE_F16, s->d_model, + s->d_inner); + nc_new_param(&s->param_list, &tl->mlp_proj_w, + "h%d/mlp/c_proj/w", layer_idx); + + tl->mlp_proj_b = nc_new_tensor_1d(d, NC_TYPE_F32, s->d_model); + nc_new_param(&s->param_list, &tl->mlp_proj_b, + "h%d/mlp/c_proj/b", layer_idx); + } + + s->ln_f_g = nc_new_tensor_1d(d, NC_TYPE_F32, s->d_model); + nc_new_param(&s->param_list, &s->ln_f_g, "ln_f/g"); + + s->ln_f_b = nc_new_tensor_1d(d, NC_TYPE_F32, s->d_model); + nc_new_param(&s->param_list, &s->ln_f_b, "ln_f/b"); + + s->wte = nc_new_tensor_2d(d, NC_TYPE_F16, s->d_model, + s->n_symbols); + nc_new_param(&s->param_list, &s->wte, "wte"); + + s->wpe = nc_new_tensor_2d(d, NC_TYPE_F32, s->d_model, + s->n_ctx); + nc_new_param(&s->param_list, &s->wpe, "wpe"); + + nc_load_coefs(&s->param_list, coefs_filename); + + /* optimize the variable storage */ + s->wte_trans = nc_transpose(nc_dup_tensor(s->wte)); + + convert_mat(&s->wte_trans); + + for(layer_idx = 0; layer_idx < s->n_layer; layer_idx++) { + tl = &layers[layer_idx]; + convert_mat(&tl->attn_w); + convert_mat(&tl->attn_proj_w); + convert_mat(&tl->mlp_fc_w); + convert_mat(&tl->mlp_proj_w); + } + return s; +} + +typedef struct { + int mem_len; + NCTensor **mem_k; + NCTensor **mem_v; +} BatchEntry; + +/* dimensions: output[train_len * n_streams][n_symbols], + input[train_len * n_streams], tab_mem[n_streams], mem_k[n_layer] + mem_v[n_layer]. 
*/ +static NCTensor *trf_eval(TransformerModel *s, int train_len, + int n_streams, BatchEntry *tab_mem, + NCTensor *input) +{ + NCTensor *layer_input, **tab_tmp, *output, *position; + TransformerLayer *tl; + int layer_idx, i, j, *ptr; + BatchEntry *be; + + tab_tmp = nc_mallocz(sizeof(tab_tmp[0]) * + max_int(max_int(3, train_len), + max_int(s->n_head, s->n_layer))); + + position = nc_new_tensor_1d(s->device, NC_TYPE_I32, + train_len * n_streams); + ptr = nc_tensor_get_ptr(position, NULL); + for(i = 0; i < train_len; i++) { + for(j = 0; j < n_streams; j++) { + ptr[i * n_streams + j] = tab_mem[j].mem_len + i; + } + } + + layer_input = nc_get_col(nc_dup_tensor(s->wte), input); + layer_input = nc_convert(layer_input, NC_TYPE_F32); + layer_input = nc_add(layer_input, nc_get_col(nc_dup_tensor(s->wpe), + position)); + + for(layer_idx = 0; layer_idx < s->n_layer; layer_idx++) { + NCTensor *query, *key, *value, *ff_input, *t0, **tab_tmp2; + + tl = &s->layers[layer_idx]; + + t0 = nc_add(nc_mul(nc_layer_norm(nc_dup_tensor(layer_input), 1e-5), + nc_dup_tensor(tl->ln_1_g)), + nc_dup_tensor(tl->ln_1_b)); + + t0 = nc_add(nc_matmul_stride(nc_dup_tensor(tl->attn_w), t0), + nc_dup_tensor(tl->attn_b)); + tab_tmp2 = nc_mallocz(sizeof(tab_tmp2[0]) * n_streams); + + /* [ train_len * n_streams d_model * 3] -> + n_streams * [ train_len d_model * 3] */ + nc_hsplit(tab_tmp2, t0, n_streams, NULL); + for(i = 0; i < n_streams; i++) { + be = &tab_mem[i]; + + t0 = tab_tmp2[i]; + nc_vsplit(tab_tmp, t0, 3, NULL); + query = tab_tmp[0]; + key = tab_tmp[1]; + value = tab_tmp[2]; + + /* split query, key and value for each head */ + key = split_head(key, s->n_head); + query = split_head(query, s->n_head); + value = split_head(value, s->n_head); + + /* save the key and value to the memory */ + t0 = nc_slice_alias(be->mem_k[layer_idx], + 1, be->mem_len, be->mem_len + train_len); + nc_tensor_copy(t0, key); + nc_free_tensor(t0); + nc_free_tensor(key); + + t0 = nc_slice_alias(be->mem_v[layer_idx], + 1, 
be->mem_len, be->mem_len + train_len); + nc_tensor_copy(t0, value); + nc_free_tensor(t0); + nc_free_tensor(value); + + key = nc_slice_alias(be->mem_k[layer_idx], + 1, 0, be->mem_len + train_len); + value = nc_slice_alias(be->mem_v[layer_idx], + 1, 0, be->mem_len + train_len); + + /* cross product term */ + t0 = nc_matmul_add(key, query, NULL, + TRUE, FALSE); + t0 = nc_mul(t0, nc_new_f32(s->device, 1.0f / sqrtf(s->d_key))); + + /* set the future cross products to -infinity so that they + don't change the softmax result */ + t0 = nc_slt_mat_set(t0, be->mem_len + 1, -INFINITY); + + t0 = nc_soft_max(t0); + t0 = nc_matmul(value, t0); + + /* merge all the heads */ + tab_tmp2[i] = concat_head(t0); + } + + t0 = nc_hconcat(tab_tmp2, n_streams); + nc_free(tab_tmp2); + + /* projection */ + t0 = nc_add(nc_matmul_stride(nc_dup_tensor(tl->attn_proj_w), t0), + nc_dup_tensor(tl->attn_proj_b)); + + t0 = nc_add(t0, layer_input); + + ff_input = nc_dup_tensor(t0); + + t0 = nc_add(nc_mul(nc_layer_norm(t0, 1e-5), + nc_dup_tensor(tl->ln_2_g)), + nc_dup_tensor(tl->ln_2_b)); + + t0 = nc_add(nc_matmul_stride(nc_dup_tensor(tl->mlp_fc_w), t0), + nc_dup_tensor(tl->mlp_fc_b)); + t0 = nc_gelu(t0); + + t0 = nc_add(nc_matmul_stride(nc_dup_tensor(tl->mlp_proj_w), t0), + nc_dup_tensor(tl->mlp_proj_b)); + + layer_input = nc_add(t0, ff_input); + } + + { + NCTensor *t0; + t0 = nc_add(nc_mul(nc_layer_norm(layer_input, 1e-5), + nc_dup_tensor(s->ln_f_g)), + nc_dup_tensor(s->ln_f_b)); + + t0 = nc_matmul_stride(nc_dup_tensor(s->wte_trans), t0); + /* need to resize the output to the exact size because the + strided matrix is larger */ + output = nc_resize(t0, s->n_symbols); + } + nc_free(tab_tmp); + return output; +} + +static void trf_end(TransformerModel *s) +{ + nc_free_tensor(s->wte_trans); + + nc_param_list_end(&s->param_list); + nc_free(s->layers); + nc_context_end(s->model); + nc_free(s); +} + +static const char *gpt2_model_name[] = { "117M", "345M", "774M", "1558M" }; + +GPT2ModelEnum 
parse_model(const char *str) +{ + int i; + for(i = 0; i < countof(gpt2_model_name); i++) { + if (!strcmp(gpt2_model_name[i], str)) + return i; + } + return (GPT2ModelEnum)-1; +} + +void trf_set_params(TransformerModelParams *p, GPT2ModelEnum model) +{ + memset(p, 0, sizeof(*p)); + p->seed = 123; + switch(model) { + case GPT2_MODEL_117M: + p->n_layer = 12; + p->d_model = 768; + break; + case GPT2_MODEL_345M: + p->n_layer = 24; + p->d_model = 1024; + break; + case GPT2_MODEL_774M: + p->n_layer = 36; + p->d_model = 1280; + break; + case GPT2_MODEL_1558M: + p->n_layer = 48; + p->d_model = 1600; + break; + default: + abort(); + } + p->d_key = 64; + p->n_head = p->d_model / p->d_key; + p->d_value = p->d_key; + p->d_inner = p->d_model * 4; + p->n_ctx = 1024; + p->n_symbols = 50257; +} + +typedef uint16_t DataSymbol; + +/****************************************************************/ +/* preprocessor */ + +static uint32_t hash_calc(const uint8_t *buf, int len, int n_bits) +{ + uint32_t h; + int i; + + h = 1; + for(i = 0; i < len; i++) { + h = h * 263 + buf[i]; + } + return h & ((1 << n_bits) - 1); +} + +static void hash_resize(WordList *s, int hash_bits) +{ + int i, h; + Word *p; + + s->hash_bits = hash_bits; + s->hash_size = 1 << hash_bits; + free(s->hash_table); + s->hash_table = malloc(sizeof(s->hash_table[0]) * s->hash_size); + for(i = 0; i < s->hash_size; i++) + s->hash_table[i] = -1; + for(i = 0; i < s->word_count; i++) { + p = &s->words[i]; + h = hash_calc(p->buf, p->len, s->hash_bits); + p->next = s->hash_table[h]; + s->hash_table[h] = i; + } +} + +static WordList *word_list_init(void) +{ + WordList *s; + + s = malloc(sizeof(WordList)); + memset(s, 0, sizeof(*s)); + s->word_count = 0; + s->word_size = 0; + hash_resize(s, 12); + return s; +} + +static void word_list_end(WordList *s) +{ + int i; + Word *p; + + for(i = 0; i < s->word_count; i++) { + p = &s->words[i]; + free(p->buf); + } + free(s->words); + free(s->hash_table); + free(s); +} + +static int64_t 
hash_lookup_count; +static int64_t hash_it_count; + +/* the hash size contains HASH_SIZE_FACTOR times more entries */ +#define HASH_SIZE_FACTOR 2 + +static Word *word_find_add(WordList *s, const uint8_t *buf, int len, int add) +{ + uint32_t h, idx; + Word *p; + + h = hash_calc(buf, len, s->hash_bits); + idx = s->hash_table[h]; + hash_lookup_count++; + while (idx != -1) { + hash_it_count++; + p = &s->words[idx]; + if (p->len == len && !memcmp(p->buf, buf, len)) + return p; + idx = p->next; + } + + if (!add) + return NULL; + + if (s->word_count >= s->word_size) { + size_t new_size = s->word_size + s->word_size / 2; + if (new_size < 32) + new_size = 32; + if (s->word_count + 1 > new_size) + new_size = s->word_count + 1; + s->words = realloc(s->words, new_size * sizeof(s->words[0])); + s->word_size = new_size; + + } + /* resize the hash table when needed */ + if ((s->word_count * HASH_SIZE_FACTOR) > s->hash_size) { + int hash_bits = s->hash_bits; + while ((s->word_count * HASH_SIZE_FACTOR) > (1 << hash_bits)) + hash_bits++; + hash_resize(s, hash_bits); + + /* recompute the hash with the new hash table size */ + h = hash_calc(buf, len, s->hash_bits); + } + + idx = s->word_count++; + p = &s->words[idx]; + p->len = len; + p->buf = malloc(len + 1); + memcpy(p->buf, buf, len); + p->buf[len] = 0; + p->next = s->hash_table[h]; + s->hash_table[h] = idx; + return p; +} + +static void word_load(WordList *s, const char *filename) +{ + FILE *f; + uint8_t buf[1024]; + int len, c; + + f = fopen(filename, "rb"); + if (!f) { + perror(filename); + exit(1); + } + len = 0; + for(;;) { + c = fgetc(f); + if (c < 0) + break; + if (c == '\n') { + if (len > 0) { + word_find_add(s, buf, len, TRUE); + } + len = 0; + } else { + if (c == '\\') { + c = fgetc(f); + if (c < 0) + break; + if (c == 'n') { + c = '\n'; + } else if (c != '\\') { + fprintf(stderr, "Invalid escape\n"); + exit(1); + } + } + if (len >= sizeof(buf)) { + fprintf(stderr, "Word too long\n"); + exit(1); + } + buf[len++] = c; + } 
+ } + fclose(f); +} + +typedef enum { + CAT_SPACE, + CAT_LETTER, + CAT_NUMBER, + CAT_OTHER, +} CharCatEnum; + +static int get_char_cat(int c) +{ + if (c == ' ') { + return CAT_SPACE; + } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || + c >= 128) { + return CAT_LETTER; + } else if (c >= '0' && c <= '9') { + return CAT_NUMBER; + } else { + return CAT_OTHER; + } +} + +static BOOL match(size_t *pmatch_len, + const uint8_t *buf, size_t buf_len, const char *str) +{ + size_t len; + len = strlen(str); + if (len <= buf_len && !memcmp(buf, str, len)) { + *pmatch_len = len; + return TRUE; + } else { + *pmatch_len = 0; + return FALSE; + } +} + +static size_t gpt2_get_word(const uint8_t *buf, size_t buf_len) +{ + size_t len, p; + int cat; + + if (buf_len == 0) + return 0; + if (buf[0] == '\'' && + (match(&len, buf, buf_len, "'s") || + match(&len, buf, buf_len, "'t") || + match(&len, buf, buf_len, "'re") || + match(&len, buf, buf_len, "'ve") || + match(&len, buf, buf_len, "'m") || + match(&len, buf, buf_len, "'ll") || + match(&len, buf, buf_len, "'d"))) { + return len; + } + p = 0; + if (buf[0] == ' ' && buf_len >= 2) + p++; + if (buf[p] != ' ') { + cat = get_char_cat(buf[p]); + len = 1 + p; + while (len < buf_len && get_char_cat(buf[len]) == cat) + len++; + return len; + } else { + return 1; + } +} + +static __unused void print_word(const uint8_t *buf, size_t len) +{ + size_t i; + int c; + for(i = 0; i < len; i++) { + c = buf[i]; + if (c >= ' ' && c <= '~') + putchar(c); + else + printf("\\x%02x", c); + } +} + +void gpt2_pp_encode(const char *word_filename, + const char *in_filename, const char *out_filename) +{ + FILE *f, *fo; + size_t buf_size, buf_pos, word_len, len, i; + uint8_t *buf; + WordList *s; + Word *p; + + f = fopen(in_filename, "rb"); + if (!f) { + perror(in_filename); + exit(1); + } + + fseek(f, 0, SEEK_END); + buf_size = ftell(f); + fseek(f, 0, SEEK_SET); + buf = malloc(buf_size * sizeof(buf[0])); + fread(buf, 1, buf_size, f); + fclose(f); + + s = 
word_list_init(); + word_load(s, word_filename); + + fo = fopen(out_filename, "wb"); + if (!fo) { + perror(out_filename); + exit(1); + } + + for(buf_pos = 0; buf_pos < buf_size; buf_pos += word_len) { + word_len = gpt2_get_word(buf + buf_pos, buf_size - buf_pos); +#if 0 + print_word(buf + buf_pos, word_len); + printf("\n"); +#endif + /* find the longest word(s) */ + for(i = 0; i < word_len; i += len) { + for(len = word_len - i; len >= 1; len--) { + p = word_find_add(s, buf + buf_pos + i, len, FALSE); + if (p) + break; + } + assert(len >= 1); + fput_be16(fo, p - s->words); + } + } + + free(buf); + + fclose(fo); + + word_list_end(s); +} + +#define SYMB_EOT 50256 + +static void add_char(DataSymbol **pbuf, + size_t *psize, size_t *plen, DataSymbol c) +{ + size_t len = *plen, size = *psize; + if ((len + 1) > size) { + size = max_size_t(max_size_t(len + 1, 4), + size * 3 / 2); + *pbuf = realloc(*pbuf, sizeof(**pbuf) * size); + *psize = size; + } + (*pbuf)[len++] = c; + *plen = len; +} + +static void gpt2_pp_encode_buf1(WordList *s, + DataSymbol **pout_buf, + size_t *pout_buf_size, size_t *pout_buf_len, + const uint8_t *buf, size_t buf_size) +{ + size_t buf_pos, word_len, len, i; + Word *p; + + for(buf_pos = 0; buf_pos < buf_size; buf_pos += word_len) { + word_len = gpt2_get_word(buf + buf_pos, buf_size - buf_pos); +#if 0 + print_word(buf + buf_pos, word_len); + printf("\n"); +#endif + /* find the longest word(s) */ + for(i = 0; i < word_len; i += len) { + for(len = word_len - i; len >= 1; len--) { + p = word_find_add(s, buf + buf_pos + i, len, FALSE); + if (p) + break; + } + assert(len >= 1); + add_char(pout_buf, pout_buf_size, pout_buf_len, p - s->words); + } + } +} + +size_t gpt2_pp_encode_buf(WordList *s, DataSymbol **pout_buf, + const uint8_t *buf, size_t buf_size) +{ + size_t out_buf_len, out_buf_size; + DataSymbol *out_buf; + + out_buf_len = 0; + out_buf_size = 0; + out_buf = NULL; + gpt2_pp_encode_buf1(s, &out_buf, &out_buf_size, &out_buf_len, + buf, buf_size); + 
*pout_buf = out_buf; + return out_buf_len; +} + +void gpt2_pp_decode(const char *word_filename, + const char *in_filename, const char *out_filename) +{ + WordList *s; + FILE *f, *fo; + uint16_t c; + Word *p; + + s = word_list_init(); + word_load(s, word_filename); + + f = fopen(in_filename, "rb"); + if (!f) { + perror(in_filename); + exit(1); + } + + fo = fopen(out_filename, "wb"); + if (!fo) { + perror(out_filename); + exit(1); + } + + for(;;) { + if (fget_be16(f, &c)) + break; + if (c >= s->word_count) { + fprintf(stderr, "Invalid symbol: %d\n", c); + exit(1); + } + p = &s->words[c]; + fwrite(p->buf, 1, p->len, fo); + } + + fclose(fo); + + fclose(f); + + word_list_end(s); +} + +static struct option options[] = { + { NULL }, +}; + +/****************************************************************/ +/* text completion */ + +static int get_random_symb_topk(float *prob, size_t n_symb, int topk, + float topp, RNDState *rnd_state) +{ + NCTopKEntry *tab; + int i, c, k; + float p; + double sum; + + assert(n_symb >= 1); + + prof_start(PROF_WRITE_SYM); + k = nc_topk(&tab, &sum, prob, n_symb, topk, topp); + prof_end(PROF_WRITE_SYM); + + p = rnd_unif(rnd_state) * sum; + + sum = 0; + for(i = 0; i < k - 1; i++) { + sum += prob[tab[i].idx]; + if (p < sum) + break; + } + c = tab[i].idx; + nc_free(tab); + return c; +} + +static void dump_pred_symb(float *prob, size_t n_symb, int k, + WordList *wl) +{ +#if 0 + int *tab, i, c; + Word *wp; + + assert(n_symb >= 1); + tab = malloc(sizeof(tab[0]) * n_symb); + for(i = 0; i < n_symb; i++) + tab[i] = i; + topk_sort(tab, n_symb, prob); + + k = min_int(n_symb, k); + for(i = 0; i < k; i++) { + c = tab[i]; + printf("%d: %10.3g '", i, prob[c]); + wp = &wl->words[c]; + fwrite(wp->buf, 1, wp->len, stdout); + printf("'\n"); + } + free(tab); +#endif +} + +char *trim_text(const char *str) +{ + size_t len; + char *new_str; + while (*str == ' ') + str++; + len = strlen(str); + while (len > 0 && str[len - 1] == ' ') + len--; + new_str = malloc(len + 
1); + memcpy(new_str, str, len + 1); + return new_str; +} + +TextCompleteGlobalState *text_complete_global_init(GPT2ModelEnum model, + const char *filename) +{ + WordList *wl; + TransformerModelParams p_s, *p = &p_s; + TransformerModel *s; + TextCompleteGlobalState *tcs; + char coefs_filename[128]; + + tcs = nc_mallocz(sizeof(*tcs)); + + trf_set_params(p, model); + if (!filename) { + snprintf(coefs_filename, sizeof(coefs_filename), + "gpt2_%s.bin", gpt2_model_name[model]); + filename = coefs_filename; + } + s = trf_init(p, filename); + + wl = word_list_init(); + word_load(wl, "gpt2vocab.txt"); + tcs->wl = wl; + tcs->trf_state = s; + return tcs; +} + +void text_complete_global_end(TextCompleteGlobalState *tcs) +{ + trf_end(tcs->trf_state); + word_list_end(tcs->wl); + nc_free(tcs); +} + +TextGenContext *text_complete_start(TextCompleteGlobalState *tcs, + const char *input_text, + int top_k, float top_p, float temperature, + int seed, int max_output_len) +{ + TransformerModel *s = tcs->trf_state; + WordList *wl = tcs->wl; + TextGenContext *ts; + int i, mem_len; + + ts = nc_mallocz(sizeof(*ts)); + ts->global_state = tcs; + ts->top_k = top_k; + ts->top_p = top_p; + ts->temperature = temperature; + rnd_init(&ts->rnd_state, seed); + ts->max_output_len = max_output_len; + ts->input_buf_len = gpt2_pp_encode_buf(wl, &ts->input_buf, + (const uint8_t *)input_text, + strlen(input_text)); + if (ts->input_buf_len > MAX_INITIAL_TEXT_LEN) { + memmove(ts->input_buf, ts->input_buf + ts->input_buf_len - MAX_INITIAL_TEXT_LEN, MAX_INITIAL_TEXT_LEN * sizeof(ts->input_buf[0])); + ts->input_buf_len = MAX_INITIAL_TEXT_LEN; + ts->input_buf = realloc(ts->input_buf, + ts->input_buf_len * sizeof(ts->input_buf[0])); + } + +#if 0 + for(i = 0; i < ts->input_buf_len; i++) { + printf(" %04x", ts->input_buf[i]); + } + printf("\n"); +#endif + + ts->mem_k = nc_mallocz(sizeof(ts->mem_k[0]) * s->n_layer); + ts->mem_v = nc_mallocz(sizeof(ts->mem_v[0]) * s->n_layer); + mem_len = ts->input_buf_len + 
max_output_len; + for(i = 0; i < s->n_layer; i++) { + ts->mem_k[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + s->d_key, mem_len, s->n_head); + nc_tensor_set_name(ts->mem_k[i], "mem_k_%d", i); + ts->mem_v[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + s->d_value, mem_len, s->n_head); + nc_tensor_set_name(ts->mem_v[i], "mem_v_%d", i); + } + ts->text_len = ts->input_buf_len; + ts->is_first = TRUE; + return ts; +} + +static void text_complete_symb(TextCompleteGlobalState *tcs, + TextGenContext *ts, NCTensor *logits) +{ + TransformerModel *s = tcs->trf_state; + WordList *wl = tcs->wl; + Word *wp; + NCTensorData xbuf, *x; + int c, out_len; + NCTensor *t0; + + t0 = logits; + if (ts->temperature != 1.0) + t0 = nc_mul(t0, nc_new_f32(s->device, 1.0f / ts->temperature)); + t0 = nc_soft_max(t0); + x = nc_tensor_get_data(&xbuf, t0); + + if (0) { + printf("\n"); + dump_pred_symb((float *)x->data, s->n_symbols, 10, wl); + } + c = get_random_symb_topk((float *)x->data, + s->n_symbols, ts->top_k, ts->top_p, + &ts->rnd_state); + if (c == SYMB_EOT) { + ts->out_text_len = 0; + ts->out_text[0] = '\0'; + } else { + wp = &wl->words[c]; + out_len = min_int(sizeof(ts->out_text) - 1, wp->len); + memcpy(ts->out_text, wp->buf, out_len); + ts->out_text[out_len] = '\0'; + ts->out_text_len = out_len; + } + ts->last_c = c; + + nc_free_tensor(t0); +} + +/* Note: ts_list is emptied */ +void text_complete_next(TextCompleteGlobalState *tcs, + struct list_head *ts_list) +{ + TransformerModel *s = tcs->trf_state; + int i, k; + NCTensor *output, *input; + int32_t *ptr; + struct list_head *el, *el1; + TextGenContext *ts, **ts_tab; + int batch_size; + BatchEntry tab_mem[BATCH_SIZE_MAX]; + + list_for_each_safe(el, el1, ts_list) { + ts = list_entry(el, TextGenContext, link); + if (ts->text_len >= s->n_ctx || + (ts->text_len - ts->input_buf_len) >= ts->max_output_len) { + ts->out_text_len = 0; + ts->out_text[0] = '\0'; + list_del(&ts->link); + } else if (ts->is_first) { + input = 
nc_new_tensor_1d(s->device, NC_TYPE_I32, ts->text_len); + ptr = nc_tensor_get_ptr(input, NULL); + for(i = 0; i < ts->text_len; i++) { + ptr[i] = ts->input_buf[i]; + } + + prof_start(PROF_EVAL); + tab_mem[0].mem_len = 0; + tab_mem[0].mem_k = ts->mem_k; + tab_mem[0].mem_v = ts->mem_v; + output = trf_eval(s, ts->text_len, 1, tab_mem, input); + prof_end(PROF_EVAL); + + text_complete_symb(tcs, ts, nc_slice_alias(output, 1, ts->text_len - 1, ts->text_len)); + nc_free_tensor(output); + + ts->text_len++; + ts->is_first = FALSE; + list_del(&ts->link); + } + } + + ts_tab = nc_mallocz(sizeof(ts_tab[0]) * BATCH_SIZE_MAX); + for(;;) { + k = 0; + list_for_each_safe(el, el1, ts_list) { + ts = list_entry(el, TextGenContext, link); + ts_tab[k++] = ts; + list_del(&ts->link); + if (k >= BATCH_SIZE_MAX) + break; + } + if (k == 0) + break; + batch_size = k; + // printf("batch_size=%d\n", k); + + for(k = 0; k < batch_size; k++) { + ts = ts_tab[k]; + tab_mem[k].mem_len = ts->text_len - 1; + tab_mem[k].mem_k = ts->mem_k; + tab_mem[k].mem_v = ts->mem_v; + } + + /* compute the next probabilities */ + input = nc_new_tensor_1d(s->device, NC_TYPE_I32, batch_size); + ptr = nc_tensor_get_ptr(input, NULL); + for(k = 0; k < batch_size; k++) { + ts = ts_tab[k]; + ptr[k] = ts->last_c; + } + + prof_start(PROF_EVAL); + output = trf_eval(s, 1, batch_size, tab_mem, input); + prof_end(PROF_EVAL); + + for(k = 0; k < batch_size; k++) { + ts = ts_tab[k]; + text_complete_symb(tcs, ts, + nc_slice_alias(output, 1, k, k + 1)); + + ts->text_len++; + ts->is_first = FALSE; + } + nc_free_tensor(output); + } + nc_free(ts_tab); +} + +void text_complete_end(TextGenContext *ts) +{ + TransformerModel *s = ts->global_state->trf_state; + int i; + + for(i = 0; i < s->n_layer; i++) { + nc_free_tensor(ts->mem_k[i]); + nc_free_tensor(ts->mem_v[i]); + } + nc_free(ts->mem_k); + nc_free(ts->mem_v); + + free(ts->input_buf); + nc_free(ts); +} + +void text_complete(GPT2ModelEnum model, const char *model_filename, + const char 
*input_text, + int top_k, float top_p, float temperature, + int max_output_len, int batch_size, int seed, + BOOL verbose) +{ + TextCompleteGlobalState *tcs; + TextGenContext *ts; + int count; + struct timeval tv; + const char *input_text1; + struct list_head ts_list; + int64_t ti; + + tcs = text_complete_global_init(model, model_filename); + + if (seed == 0) { + gettimeofday(&tv, NULL); + seed = tv.tv_sec + tv.tv_usec; + } + + input_text1 = trim_text(input_text); + if (input_text1[0] == '\0') + input_text1 = strdup(" "); + printf("%s", input_text1); + fflush(stdout); + prof_start(PROF_TOTAL); + if (batch_size == 0) { + ts = text_complete_start(tcs, input_text1, top_k, top_p, temperature, + seed, max_output_len); + + ti = get_time_ms(); + count = 0; + for(;;) { + init_list_head(&ts_list); + list_add_tail(&ts->link, &ts_list); + text_complete_next(tcs, &ts_list); + if (ts->out_text_len == 0) + break; + fwrite(ts->out_text, 1, ts->out_text_len, stdout); + fflush(stdout); + count++; + } + printf("\n"); + text_complete_end(ts); + } else { + TextGenContext **ts_tab; + int i; + + /* test for batch processing (the same text is generated by + each job) */ + + ts_tab = nc_mallocz(sizeof(ts_tab[0]) * batch_size); + + for(i = 0; i < batch_size; i++) { + ts = text_complete_start(tcs, input_text1, top_k, top_p, + temperature, seed, max_output_len); + ts_tab[i] = ts; + } + + ti = get_time_ms(); + count = 0; + for(;;) { + init_list_head(&ts_list); + for(i = 0; i < batch_size; i++) { + ts = ts_tab[i]; + if (ts->is_first || ts->out_text_len > 0) { + list_add_tail(&ts->link, &ts_list); + } + } + if (list_empty(&ts_list)) + break; + text_complete_next(tcs, &ts_list); + + for(i = 0; i < batch_size; i++) { + ts = ts_tab[i]; + if (ts->out_text_len > 0 && i == 0) { + fwrite(ts->out_text, 1, ts->out_text_len, stdout); + fflush(stdout); + } + } + count++; + } + printf("\n"); + + for(i = 0; i < batch_size; i++) { + ts = ts_tab[i]; + text_complete_end(ts); + } + nc_free(ts_tab); + } + ti = 
get_time_ms() - ti; + if (verbose) { + printf("time=%0.1f word/s\n", + (double)count / ti * 1000); + } + prof_end(PROF_TOTAL); + text_complete_global_end(tcs); + + nc_prof_dump(); +} + +/******************************************************************/ +/* short text compression */ + +/* Note: at most 31 bits are encoded. At most UTF8_CHAR_LEN_MAX bytes + are output. */ +int unicode_to_utf8(uint8_t *buf, unsigned int c) +{ + uint8_t *q = buf; + + if (c < 0x80) { + *q++ = c; + } else { + if (c < 0x800) { + *q++ = (c >> 6) | 0xc0; + } else { + if (c < 0x10000) { + *q++ = (c >> 12) | 0xe0; + } else { + if (c < 0x00200000) { + *q++ = (c >> 18) | 0xf0; + } else { + if (c < 0x04000000) { + *q++ = (c >> 24) | 0xf8; + } else if (c < 0x80000000) { + *q++ = (c >> 30) | 0xfc; + *q++ = ((c >> 24) & 0x3f) | 0x80; + } else { + return 0; + } + *q++ = ((c >> 18) & 0x3f) | 0x80; + } + *q++ = ((c >> 12) & 0x3f) | 0x80; + } + *q++ = ((c >> 6) & 0x3f) | 0x80; + } + *q++ = (c & 0x3f) | 0x80; + } + return q - buf; +} + +static const unsigned int utf8_min_code[5] = { + 0x80, 0x800, 0x10000, 0x00200000, 0x04000000, +}; + +static const unsigned char utf8_first_code_mask[5] = { + 0x1f, 0xf, 0x7, 0x3, 0x1, +}; + +/* return -1 if error. *pp is not updated in this case. max_len must + be >= 1. The maximum length for a UTF8 byte sequence is 6 bytes. */ +int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp) +{ + int l, c, b, i; + + c = *p++; + if (c < 0x80) { + *pp = p; + return c; + } + switch(c) { + case 0xc0 ... 0xdf: + l = 1; + break; + case 0xe0 ... 0xef: + l = 2; + break; + case 0xf0 ... 0xf7: + l = 3; + break; + case 0xf8 ... 0xfb: + l = 4; + break; + case 0xfc ... 
0xfd: + l = 5; + break; + default: + return -1; + } + /* check that we have enough characters */ + if (l > (max_len - 1)) + return -1; + c &= utf8_first_code_mask[l - 1]; + for(i = 0; i < l; i++) { + b = *p++; + if (b < 0x80 || b >= 0xc0) + return -1; + c = (c << 6) | (b & 0x3f); + } + if (c < utf8_min_code[l - 1]) + return -1; + *pp = p; + return c; +} + +static inline int simple_get_bit(const uint8_t *data, size_t index) +{ + return (data[index >> 3] >> (7 - (index & 7))) & 1; +} + +static inline void simple_put_bit(uint8_t *data, size_t index, int bit) +{ + data[index >> 3] |= bit << (7 - (index & 7)); +} + +static uint16_t ranges[3][2] = { + { 0x3400, 0x4DB5 }, + { 0x4e00, 0x9fcf }, + { 0xAC00, 0xD7A3 }, +}; + +static int c15_to_unicode(int c) +{ + int i, n, count; + for(i = 0; i < countof(ranges); i++) { + count = ranges[i][1] - ranges[i][0] + 1; + n = count; + if (c < n) { + return ranges[i][0] + c; + } + c -= count; + } + return -1; +} + +static int unicode_to_c15(int c) +{ + int i, b; + b = 0; + for(i = 0; i < countof(ranges); i++) { + if (c >= ranges[i][0] && c <= ranges[i][1]) + return b + c - ranges[i][0]; + b += ranges[i][1] - ranges[i][0] + 1; + } + return -1; +} + +size_t convert_to_chars(char **pout_buf, uint8_t *buf, size_t n_bits) +{ + size_t idx, out_buf_len; + int c, i, l, len; + char buf1[8], *out_buf; + + out_buf = malloc(4 * ((n_bits + 14) / 15) + 1); + out_buf_len = 0; + for(idx = 0; idx < n_bits; idx += 15) { + l = min_size_t(15, n_bits - idx); + c = 0; + for(i = 0; i < l; i++) { + c |= simple_get_bit(buf, idx + i) << (14 - i); + } + c = c15_to_unicode(c); + len = unicode_to_utf8((uint8_t *)buf1, c); + memcpy(out_buf + out_buf_len, buf1, len); + out_buf_len += len; + } + out_buf[out_buf_len] = '\0'; + *pout_buf = out_buf; + return out_buf_len; +} + +/* return -1 if error */ +ssize_t convert_from_chars(uint8_t **pout_buf, const char *str) +{ + const char *str_end; + int c, i; + uint8_t *out_buf; + size_t str_len, len; + + str_len = 
strlen(str); + str_end = str + str_len; + /* Note: the exact length of out_buf is smaller */ + out_buf = malloc(str_len); + memset(out_buf, 0, str_len); + + len = 0; + while (*str != '\0') { + c = unicode_from_utf8((uint8_t *)str, str_end - str, (const uint8_t **)&str); + if (c < 0) + goto fail; + c = unicode_to_c15(c); + if (c < 0 || c >= 32768) + goto fail; + for(i = 0; i < 15; i++) { + simple_put_bit(out_buf, len * 15 + i, (c >> (14 - i)) & 1); + } + len++; + } + *pout_buf = out_buf; + return (len * 15 + 7) / 8; + fail: + free(out_buf); + return -1; +} + +#define LENGTH_K 2 + +int encode_length(PutBitState *pb, uint32_t val) +{ + uint32_t n, a, b, i; + a = val; + n = 1; + for(;;) { + b = 1 << (LENGTH_K * n); + if (a < b) + break; + n++; + a -= b; + } + for(i = 0; i < n - 1; i++) + put_bit_raw(pb, 0); + put_bit_raw(pb, 1); + for(i = 0; i < (LENGTH_K * n); i++) { + put_bit_raw(pb, (a >> (LENGTH_K * n - 1 - i)) & 1); + } + return n + LENGTH_K * n; +} + +int decode_length(GetBitState *gb) +{ + int n, val, a, i; + n = 1; + a = 0; + for(;;) { + if (get_bit_raw(gb)) + break; + if (n >= 10) /* arbitrary limit */ + return -1; + a += 1 << (LENGTH_K * n); + n++; + } + val = 0; + for(i = 0; i < (LENGTH_K * n); i++) { + val |= get_bit_raw(gb) << (LENGTH_K * n - 1 - i); + } + return val + a; +} + +static void realloc_buf(char **pbuf, + size_t *psize, size_t len) +{ + size_t size = *psize; + if (len > size) { + size = max_size_t(len, size * 3 / 2); + *pbuf = realloc(*pbuf, sizeof(**pbuf) * size); + *psize = size; + } +} + + +#define CTEXT_LEN_MAX 256 + +int text_decompress(TextCompleteGlobalState *tcs, + char **poutput_text, const char *input_text) +{ + TransformerModel *s = tcs->trf_state; + WordList *wl = tcs->wl; + uint8_t *data_buf; + ssize_t data_buf_len, text_len, mem_len; + GetBitState gb_s, *gb = &gb_s; + BatchEntry tab_mem[1]; + NCTensor **mem_k, **mem_v; + DataSymbol *text_buf; + NCTensorData xbuf, *x; + int c, i; + char *out_str; + size_t out_str_len, out_str_size; 
+ + *poutput_text = NULL; + + /* XXX: handle zero length ? */ + data_buf_len = convert_from_chars(&data_buf, input_text); + if (data_buf_len < 0) + return -1; + if (data_buf_len == 0) { + *poutput_text = strdup(""); + free(data_buf); + return 0; + } +#if 0 + { + int i; + printf("data_buf="); + for(i = 0; i < data_buf_len; i++) + printf(" %02x", data_buf[i]); + printf("\n"); + } +#endif + get_bit_init(gb, data_buf, data_buf_len, NULL, NULL); + + text_len = decode_length(gb); + if (text_len < 0 || text_len > CTEXT_LEN_MAX) { + free(data_buf); + return -1; + } + text_len++; + + text_buf = nc_malloc(sizeof(text_buf[0]) * text_len); + + mem_k = nc_mallocz(sizeof(mem_k[0]) * s->n_layer); + mem_v = nc_mallocz(sizeof(mem_v[0]) * s->n_layer); + mem_len = text_len; + for(i = 0; i < s->n_layer; i++) { + mem_k[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + s->d_key, mem_len, s->n_head); + nc_tensor_set_name(mem_k[i], "mem_k_%d", i); + mem_v[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + s->d_value, mem_len, s->n_head); + nc_tensor_set_name(mem_v[i], "mem_v_%d", i); + } + tab_mem[0].mem_k = mem_k; + tab_mem[0].mem_v = mem_v; + + text_buf[0] = SYMB_EOT; + + for(i = 0; i < text_len - 1; i++) { + NCTensor *t0, *input; + int32_t *ptr; + + input = nc_new_tensor_1d(s->device, NC_TYPE_I32, 1); + ptr = nc_tensor_get_ptr(input, NULL); + ptr[0] = text_buf[i]; + tab_mem[0].mem_len = i; + t0 = trf_eval(s, 1, 1, tab_mem, input); + + t0 = nc_soft_max(t0); + x = nc_tensor_get_data(&xbuf, t0); + c = read_sym(gb, (float *)x->data, x->dims[0]); + text_buf[i + 1] = c; + nc_free_tensor(t0); + } + + /* convert back to a string */ + out_str = NULL; + out_str_len = 0; + out_str_size = 0; + for(i = 1; i < text_len; i++) { + Word *wp; + wp = &wl->words[text_buf[i]]; + realloc_buf(&out_str, &out_str_size, out_str_len + wp->len); + memcpy(out_str + out_str_len, wp->buf, wp->len); + out_str_len += wp->len; + } + realloc_buf(&out_str, &out_str_size, out_str_len + 1); + out_str[out_str_len] = '\0'; + + 
for(i = 0; i < s->n_layer; i++) { + nc_free_tensor(mem_k[i]); + nc_free_tensor(mem_v[i]); + } + nc_free(mem_k); + nc_free(mem_v); + nc_free(text_buf); + free(data_buf); + + *poutput_text = out_str; + + return 0; +} + +#define TEXT_OUTPUT_BUF_LEN 4096 + +static void text_arith_write_buf(void *opaque, const uint8_t *buf, size_t buf_size) +{ + /* we assume the output is small enough to fit the buffer */ +} + +int text_compress(TextCompleteGlobalState *tcs, + char **poutput_text, + const char *input_text, BOOL dump_stats) +{ + TransformerModel *s = tcs->trf_state; + DataSymbol *input_buf; + int i, mem_len; + NCTensorData xbuf, *x; + double n_bits; + BatchEntry tab_mem[1]; + NCTensor **mem_k, **mem_v, *output, *input; + PutBitState pb_s, *pb = &pb_s; + size_t input_buf_len, input_buf_size, out_buf_len; + uint8_t *out_buf; + char *out_str; + int32_t *ptr; + + *poutput_text = NULL; + + input_buf = NULL; + input_buf_size = 0; + input_buf_len = 0; + + add_char(&input_buf, &input_buf_size, &input_buf_len, SYMB_EOT); + gpt2_pp_encode_buf1(tcs->wl, &input_buf, &input_buf_size, &input_buf_len, + (const uint8_t *)input_text, + strlen(input_text)); + if (input_buf_len > CTEXT_LEN_MAX) { + free(input_buf); + return -1; + } + if (input_buf_len == 1) { + free(input_buf); + *poutput_text = strdup(""); + return 0; + } + +#if 0 + for(i = 0; i < input_buf_len; i++) { + printf(" %04x", input_buf[i]); + } + printf("\n"); +#endif + prof_start(PROF_EVAL); + input = nc_new_tensor_1d(s->device, NC_TYPE_I32, input_buf_len); + ptr = nc_tensor_get_ptr(input, NULL); + for(i = 0; i < input_buf_len; i++) { + ptr[i] = input_buf[i]; + } + + mem_k = nc_mallocz(sizeof(mem_k[0]) * s->n_layer); + mem_v = nc_mallocz(sizeof(mem_v[0]) * s->n_layer); + mem_len = input_buf_len; + for(i = 0; i < s->n_layer; i++) { + mem_k[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + s->d_key, mem_len, s->n_head); + nc_tensor_set_name(mem_k[i], "mem_k_%d", i); + mem_v[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + 
s->d_value, mem_len, s->n_head); + nc_tensor_set_name(mem_v[i], "mem_v_%d", i); + } + tab_mem[0].mem_len = 0; + tab_mem[0].mem_k = mem_k; + tab_mem[0].mem_v = mem_v; + + output = trf_eval(s, input_buf_len, 1, tab_mem, input); + prof_end(PROF_EVAL); + + out_buf = malloc(TEXT_OUTPUT_BUF_LEN); + put_bit_init(pb, out_buf, TEXT_OUTPUT_BUF_LEN, text_arith_write_buf, NULL); + + n_bits = encode_length(pb, input_buf_len - 1); + + for(i = 0; i < input_buf_len - 1; i++) { + double v; + NCTensor *t0; + t0 = nc_soft_max(nc_slice_alias(output, 1, i, i + 1)); + x = nc_tensor_get_data(&xbuf, t0); + write_sym(pb, (float *)x->data, x->dims[0], input_buf[i + 1]); + v = -log2(((float *)x->data)[input_buf[i + 1]]); + // printf("%d: %0.1f\n", i, v); + nc_free_tensor(t0); + n_bits += v; + } + nc_free_tensor(output); + out_buf_len = put_bit_flush(pb); +#if 0 + { + printf("out_buf="); + for(i = 0; i < (out_buf_len + 7) / 8; i++) + printf(" %02x", out_buf[i]); + printf("\n"); + } +#endif + convert_to_chars(&out_str, out_buf, out_buf_len); + if (dump_stats) { + printf("%d chars, %" PRId64 " symbols, %" PRId64 " bits (ref=%0.1f bits) (%d compressed chars)\n", + (int)strlen(input_text), + (int64_t)input_buf_len, + (int64_t)out_buf_len, + n_bits, + (int)((out_buf_len + 14) / 15)); + } + + free(out_buf); + free(input_buf); + for(i = 0; i < s->n_layer; i++) { + nc_free_tensor(mem_k[i]); + nc_free_tensor(mem_v[i]); + } + nc_free(mem_k); + nc_free(mem_v); + *poutput_text = out_str; + return 0; +} + +void text_compress_test(GPT2ModelEnum model, const char *model_filename, + const char *input_text, + BOOL is_decode, BOOL verbose) +{ + TextCompleteGlobalState *tcs; + char *out_str; + + tcs = text_complete_global_init(model, model_filename); + + if (is_decode) { + if (text_decompress(tcs, &out_str, input_text) < 0) { + printf("Error\n"); + } else { + printf("%s\n", out_str); + } + free(out_str); + } else { + if (text_compress(tcs, &out_str, input_text, verbose) < 0) { + printf("Error\n"); + } else { + 
printf("%s\n", out_str); + } + free(out_str); + } + text_complete_global_end(tcs); +} + +/*************************************************/ +/* file compression */ + +static uint8_t *load_file(size_t *psize, const char *filename) +{ + FILE *f; + size_t size; + uint8_t *buf; + + f = fopen(filename, "rb"); + if (!f) { + perror(filename); + exit(1); + } + fseek(f, 0, SEEK_END); + size = ftell(f); + fseek(f, 0, SEEK_SET); + buf = malloc(size + 1); + if (fread(buf, 1, size, f) != size) { + fprintf(stderr, "%s: I/O error\n", filename); + exit(1); + } + buf[size] = '\0'; + fclose(f); + *psize = size; + return buf; +} + +/* check if CRLF can be converted to LF losslessly */ +static BOOL check_lossless_crlf(const uint8_t *buf, size_t len) +{ + size_t i; + BOOL has_crlf; + has_crlf = FALSE; + for(i = 0; i < len - 1;) { + if (buf[i] == '\r' && buf[i + 1] == '\n') { + has_crlf = TRUE; + i += 2; + } else if (buf[i] == '\n') { + return FALSE; + } else { + i++; + } + } + return has_crlf; +} + +static size_t convert_crlf_to_lf(uint8_t *buf, size_t len) +{ + size_t i, j; + j = 0; + for(i = 0; i < len - 1;) { + if (buf[i] == '\r' && buf[i + 1] == '\n') + i++; + buf[j++] = buf[i++]; + } + if (i < len) + buf[j++] = buf[i++]; + return j; +} + +#define ARITH_BUF_LEN 65536 + +static void arith_write_buf(void *opaque, const uint8_t *buf, size_t buf_size) +{ + FILE *f = opaque; + fwrite(buf, 1, buf_size, f); +} + +/* XXX: should use a large batch size */ +int file_compress(TextCompleteGlobalState *tcs, + const char *infilename, const char *outfilename) +{ + TransformerModel *s = tcs->trf_state; + DataSymbol *input_buf; + int i, mem_len, len; + NCTensorData xbuf, *x; + BatchEntry tab_mem[1]; + NCTensor **mem_k, **mem_v, *output, *input; + PutBitState pb_s, *pb = &pb_s; + size_t input_buf_len, input_buf_size, input_text_len; + int64_t n_output_bits; + size_t input_buf_pos; + uint8_t *input_text, *arith_buf; + FILE *f; + BOOL convert_crlf; + int32_t *ptr; + + input_text = 
load_file(&input_text_len, infilename); + + convert_crlf = check_lossless_crlf(input_text, input_text_len); + // printf("convert_crlf=%d\n", convert_crlf); + + if (convert_crlf) { + input_text_len = convert_crlf_to_lf(input_text, input_text_len); + } + + input_buf = NULL; + input_buf_size = 0; + input_buf_len = 0; + + add_char(&input_buf, &input_buf_size, &input_buf_len, SYMB_EOT); + gpt2_pp_encode_buf1(tcs->wl, &input_buf, &input_buf_size, &input_buf_len, + input_text, input_text_len); + add_char(&input_buf, &input_buf_size, &input_buf_len, SYMB_EOT); + +#if 0 + for(i = 0; i < input_buf_len; i++) { + printf(" %04x", input_buf[i]); + } + printf("\n"); +#endif + prof_start(PROF_EVAL); + mem_k = nc_mallocz(sizeof(mem_k[0]) * s->n_layer); + mem_v = nc_mallocz(sizeof(mem_v[0]) * s->n_layer); + for(i = 0; i < s->n_layer; i++) { + mem_k[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + s->d_key, s->n_ctx, s->n_head); + nc_tensor_set_name(mem_k[i], "mem_k_%d", i); + mem_v[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + s->d_value, s->n_ctx, s->n_head); + nc_tensor_set_name(mem_v[i], "mem_v_%d", i); + } + + f = fopen(outfilename, "wb"); + if (!f) { + perror(outfilename); + exit(1); + } + + arith_buf = nc_malloc(ARITH_BUF_LEN); + put_bit_init(pb, arith_buf, ARITH_BUF_LEN, arith_write_buf, f); + + put_bit_raw(pb, convert_crlf); + + mem_len = 0; + input_buf_pos = 0; + while (input_buf_pos < (input_buf_len - 1)) { + len = min_size_t(input_buf_len - 1 - input_buf_pos, s->n_ctx - mem_len); + printf("%5.1f%% \r", (double)input_buf_pos / (double)input_buf_len * 100); + fflush(stdout); + // printf("pos=%d mem_len=%d len=%d\n", (int)input_buf_pos, mem_len, len); + + input = nc_new_tensor_1d(s->device, NC_TYPE_I32, mem_len + len); + ptr = nc_tensor_get_ptr(input, NULL); + for(i = 0; i < mem_len + len; i++) { + ptr[i] = input_buf[input_buf_pos - mem_len + i]; + } + tab_mem[0].mem_len = 0; + tab_mem[0].mem_k = mem_k; + tab_mem[0].mem_v = mem_v; + + output = trf_eval(s, mem_len + len, 1, 
tab_mem, input); + + for(i = 0; i < len; i++) { + NCTensor *t0; + t0 = nc_soft_max(nc_slice_alias(output, 1, mem_len + i, + mem_len + i + 1)); + x = nc_tensor_get_data(&xbuf, t0); + write_sym(pb, (float *)x->data, + x->dims[0], input_buf[input_buf_pos + i + 1]); + nc_free_tensor(t0); + } + nc_free_tensor(output); + + input_buf_pos += len; + mem_len = min_int(mem_len + len, s->n_ctx / 2); + } + + prof_end(PROF_EVAL); + + n_output_bits = put_bit_flush(pb); + + printf("-> %" PRId64 " bytes\n", (n_output_bits + 7) / 8); + fclose(f); + nc_free(arith_buf); + + free(input_buf); + for(i = 0; i < s->n_layer; i++) { + nc_free_tensor(mem_k[i]); + nc_free_tensor(mem_v[i]); + } + nc_free(mem_k); + nc_free(mem_v); + return 0; +} + +int file_decompress(TextCompleteGlobalState *tcs, + const char *infilename, const char *outfilename) +{ + TransformerModel *s = tcs->trf_state; + WordList *wl = tcs->wl; + uint8_t *data_buf; + ssize_t data_buf_len; + GetBitState gb_s, *gb = &gb_s; + BatchEntry tab_mem[1]; + NCTensor **mem_k, **mem_v, *input, *t0; + DataSymbol *text_buf; + NCTensorData xbuf, *x; + Word *wp; + int c, i, pos; + FILE *f; + BOOL convert_crlf; + int32_t *ptr; + + data_buf = load_file((size_t *)&data_buf_len, infilename); +#if 0 + { + int i; + printf("data_buf="); + for(i = 0; i < data_buf_len; i++) + printf(" %02x", data_buf[i]); + printf("\n"); + } +#endif + get_bit_init(gb, data_buf, data_buf_len, NULL, NULL); + + convert_crlf = get_bit_raw(gb); + + text_buf = nc_malloc(sizeof(text_buf[0]) * s->n_ctx); + + mem_k = nc_mallocz(sizeof(mem_k[0]) * s->n_layer); + mem_v = nc_mallocz(sizeof(mem_v[0]) * s->n_layer); + for(i = 0; i < s->n_layer; i++) { + mem_k[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + s->d_key, s->n_ctx, s->n_head); + nc_tensor_set_name(mem_k[i], "mem_k_%d", i); + mem_v[i] = nc_new_tensor_3d(s->device, NC_TYPE_F32, + s->d_value, s->n_ctx, s->n_head); + nc_tensor_set_name(mem_v[i], "mem_v_%d", i); + } + tab_mem[0].mem_k = mem_k; + tab_mem[0].mem_v = mem_v; + 
+ text_buf[0] = SYMB_EOT; + + f = fopen(outfilename, "wb"); + if (!f) + perror(outfilename); + + pos = 0; + for(;;) { + input = nc_new_tensor_1d(s->device, NC_TYPE_I32, 1); + ptr = nc_tensor_get_ptr(input, NULL); + ptr[0] = text_buf[pos]; + tab_mem[0].mem_len = pos; + t0 = trf_eval(s, 1, 1, tab_mem, input); + t0 = nc_soft_max(t0); + x = nc_tensor_get_data(&xbuf, t0); + c = read_sym(gb, (float *)x->data, x->dims[0]); + nc_free_tensor(t0); + if (c == SYMB_EOT) + break; + wp = &wl->words[c]; + if (convert_crlf) { + for(i = 0; i < wp->len; i++) { + if (wp->buf[i] == '\n') + fputc('\r', f); + fputc(wp->buf[i], f); + } + } else { + fwrite(wp->buf, 1, wp->len, f); + } + fflush(f); + pos++; + if (pos >= s->n_ctx) { + int n; + /* buffer full: restart with the last n_ctx / 2 symbols */ + n = s->n_ctx / 2; + for(i = 0; i < n; i++) + text_buf[i] = text_buf[pos - n + i]; + + input = nc_new_tensor_1d(s->device, NC_TYPE_I32, n); + ptr = nc_tensor_get_ptr(input, NULL); + for(i = 0; i < n; i++) + ptr[i] = text_buf[i]; + tab_mem[0].mem_len = 0; + t0 = trf_eval(s, n, 1, tab_mem, input); + nc_free_tensor(t0); + pos = n; + } + text_buf[pos] = c; + } + + fclose(f); + + for(i = 0; i < s->n_layer; i++) { + nc_free_tensor(mem_k[i]); + nc_free_tensor(mem_v[i]); + } + nc_free(mem_k); + nc_free(mem_v); + nc_free(text_buf); + free(data_buf); + + return 0; +} diff --git a/gpt2/gpt2tc.h b/gpt2/gpt2tc.h new file mode 100644 index 0000000..a110569 --- /dev/null +++ b/gpt2/gpt2tc.h @@ -0,0 +1,143 @@ +#ifndef _GPT2TC_H +#define _GPT2TC_H +#ifdef __cplusplus +extern "C" { +#endif +#include +#include +#include +#include + +#include "cutils.h" +#include "arith.h" +#include "cp_utils.h" +#include "list.h" +#include "libnc.h" + +#define MAX_INITIAL_TEXT_LEN 256 /* in symbols */ +#define MAX_OUTPUT_LEN 100 +#define DEFAULT_TOP_K 40 +#define DEFAULT_TOP_P 0.9 +#define BATCH_SIZE_MAX 16 +//#define BATCH_SIZE_MAX 1 + + +typedef uint16_t DataSymbol; + +typedef enum { + GPT2_MODEL_117M, + GPT2_MODEL_345M, + 
GPT2_MODEL_774M, + GPT2_MODEL_1558M, +} GPT2ModelEnum; + +typedef struct { + BOOL is_decoder; + int n_layer; + int d_model; + int n_head; + int d_key; + int d_value; + int d_inner; + int n_ctx; + int n_symbols; + uint32_t seed; +} TransformerModelParams; + +typedef struct { + NCTensor *ln_1_g, *ln_1_b; + NCTensor *attn_w, *attn_b; + NCTensor *attn_proj_w, *attn_proj_b; + + NCTensor *ln_2_g, *ln_2_b; + NCTensor *mlp_fc_w, *mlp_fc_b; + NCTensor *mlp_proj_w, *mlp_proj_b; +} TransformerLayer; + +typedef struct { + RNDState rnd_state; + NCContext *model; + NCDevice *device; + int n_layer; + int d_model; + int n_head; + int d_key; + int d_value; + int d_inner; + int n_symbols; + int n_ctx; + + /* parameters */ + NCParamList param_list; + TransformerLayer *layers; + NCTensor *wte, *wpe, *wte_trans; + NCTensor *ln_f_g, *ln_f_b; +} TransformerModel; + +typedef struct Word { + uint32_t next; /* -1 = end */ + uint32_t len; + uint8_t *buf; +} Word; + +typedef struct { + Word *words; + size_t word_count; + size_t word_size; + uint32_t *hash_table; + int hash_size; + int hash_bits; +} WordList; + +typedef struct { + TransformerModel *trf_state; + WordList *wl; +} TextCompleteGlobalState; + +typedef struct { + struct list_head link; + TextCompleteGlobalState *global_state; + int top_k; + float top_p; + float temperature; + RNDState rnd_state; + NCTensor **mem_k, **mem_v; + DataSymbol *input_buf; + int input_buf_len; + int text_len; /* current input text len */ + BOOL is_first; + int last_c; + int max_output_len; + + /* output */ + char out_text[1024]; + int out_text_len; /* 0 means end of output */ +} TextGenContext; + +GPT2ModelEnum parse_model(const char *str); +void trf_set_params(TransformerModelParams *p, GPT2ModelEnum model); +void gpt2_pp_encode(const char *word_filename, const char *in_filename, const char *out_filename); +size_t gpt2_pp_encode_buf(WordList *s, DataSymbol **pout_buf, const uint8_t *buf, size_t buf_size); +void gpt2_pp_decode(const char *word_filename, 
const char *in_filename, const char *out_filename); +char *trim_text(const char *str); +TextCompleteGlobalState *text_complete_global_init(GPT2ModelEnum model, const char *filename); +void text_complete_global_end(TextCompleteGlobalState *tcs); +TextGenContext *text_complete_start(TextCompleteGlobalState *tcs, const char *input_text, int top_k, float top_p, float temperature, int seed, int max_output_len); +void text_complete_next(TextCompleteGlobalState *tcs, struct list_head *ts_list); +void text_complete_end(TextGenContext *ts); +void text_complete(GPT2ModelEnum model, const char *model_filename, const char *input_text, int top_k, float top_p, float temperature, int max_output_len, int batch_size, int seed, BOOL verbose); +int unicode_to_utf8(uint8_t *buf, unsigned int c); +int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp); +size_t convert_to_chars(char **pout_buf, uint8_t *buf, size_t n_bits); +ssize_t convert_from_chars(uint8_t **pout_buf, const char *str); +int encode_length(PutBitState *pb, uint32_t val); +int decode_length(GetBitState *gb); +int text_decompress(TextCompleteGlobalState *tcs, char **poutput_text, const char *input_text); +int text_compress(TextCompleteGlobalState *tcs, char **poutput_text, const char *input_text, BOOL dump_stats); +void text_compress_test(GPT2ModelEnum model, const char *model_filename, const char *input_text, BOOL is_decode, BOOL verbose); +int file_compress(TextCompleteGlobalState *tcs, const char *infilename, const char *outfilename); +int file_decompress(TextCompleteGlobalState *tcs, const char *infilename, const char *outfilename); +#ifdef __cplusplus +} +#endif +#endif diff --git a/gpt2/gpt2vocab.txt b/gpt2/gpt2vocab.txt new file mode 100644 index 0000000..62c74b2 Binary files /dev/null and b/gpt2/gpt2vocab.txt differ diff --git a/gpt2/libnc.h b/gpt2/libnc.h new file mode 100644 index 0000000..8a478c5 --- /dev/null +++ b/gpt2/libnc.h @@ -0,0 +1,426 @@ +/* + * LibNC + * + * Copyright (c) 2018-2019 
Fabrice Bellard + * + */ +#ifndef LIBNC_H +#define LIBNC_H + +#include +#include "cutils.h" +#include "list.h" + +/* profiling */ + +typedef enum { + PROF_EVAL, + PROF_GRAD, + PROF_SGD, + PROF_UPDATE, + PROF_WRITE_SYM, + PROF_PROBE, + PROF_TOTAL, + PROF_COUNT, +} ProfEnum; + +#ifdef PROFILE + +extern int64_t prof_cycles[PROF_COUNT]; +extern int64_t prof_samples[PROF_COUNT]; +extern int64_t prof_ops[PROF_COUNT]; + +static inline void prof_start(int idx) +{ + prof_cycles[idx] -= get_cycles(); +} + +static inline void prof_end(int idx) +{ + prof_cycles[idx] += get_cycles(); + prof_samples[idx]++; +} + +static inline void prof_end_ops(int idx, int n_ops) +{ + prof_cycles[idx] += get_cycles(); + prof_ops[idx] += n_ops; + prof_samples[idx]++; +} + +#else + +static inline void prof_start(int idx) +{ +} + +static inline void prof_end(int idx) +{ +} + +static inline void prof_end_ops(int idx, int n_ops) +{ +} + +#endif + +void nc_prof_dump(void); + +/* Automatic Differentiation Engine */ + +typedef struct NCContext NCContext; +typedef struct NCDevice NCDevice; +typedef struct NCTensor NCTensor; +typedef struct NCTensorBuffer NCTensorBuffer; +typedef struct NCNode NCNode; +typedef struct NCRNDState NCRNDState; +typedef struct NCSGDOptState NCSGDOptState; + +typedef enum { + NC_TYPE_F32, + NC_TYPE_BF16, + NC_TYPE_F16, + NC_TYPE_I8, + NC_TYPE_I16, + NC_TYPE_I32, + NC_TYPE_COUNT, +} NCTypeEnum; + +extern size_t nc_type_size_table[NC_TYPE_COUNT]; +extern const char *nc_type_name_table[NC_TYPE_COUNT]; + +#define NC_N_DIMS_MAX 4 /* maximum number of axis for tensors */ + +typedef struct NCTensorData { + NCTypeEnum item_type; + size_t item_size; + void *data; + size_t stride; /* in elements */ + size_t n_strides; /* prod(j = 1 ... 
n_dims, dims[j]); */ + int n_dims; + const size_t *dims; /* n_dims length */ + const size_t *strides; /* n_dims length, strides in bytes */ +} NCTensorData; + +void *nc_malloc(size_t size); +void *nc_mallocz(size_t size); +void nc_free(void *ptr); + +NCContext *nc_context_init(int nb_threads); +void nc_context_end(NCContext *m); + +NCDevice *nc_new_cpu_device(NCContext *m); +NCDevice *nc_new_cuda_device(NCContext *m, int device_index); +NCDevice *nc_new_device(NCContext *m, const char *device_name); +void nc_synchronize(NCDevice *d); + +NCTensorBuffer *nc_new_tensor_buffer(NCDevice *d, size_t size); +NCTensorBuffer *nc_dup_tensor_buffer(const NCTensorBuffer *b); +void nc_free_tensor_buffer(NCTensorBuffer *b); + +NCTensor *nc_new_tensor(NCDevice *d, NCTypeEnum type, + int n_dims, const size_t *dims); +NCTensor *nc_new_tensor_from_tensor(const NCTensor *x); +NCTensor *nc_new_tensor_from_tensor_nz(const NCTensor *x); +NCTensor *nc_new_scalar(NCDevice *d, NCTypeEnum type); +NCTensor *nc_new_tensor_1d(NCDevice *d, NCTypeEnum type, size_t len); +NCTensor *nc_new_tensor_2d(NCDevice *d, NCTypeEnum type, size_t n0, size_t n1); +NCTensor *nc_new_tensor_3d(NCDevice *d, NCTypeEnum type, + size_t n0, size_t n1, size_t n2); +NCTensor *nc_new_tensor_4d(NCDevice *d, NCTypeEnum type, + size_t n0, size_t n1, size_t n2, size_t n3); +NCTensor *__attribute__((format(printf, 2, 3))) nc_tensor_set_name(NCTensor *x, const char *fmt, ...); +NCTensor *nc_dup_tensor(const NCTensor *x); +void nc_free_tensor(NCTensor *x); +void nc_dump_tensor(const char *name, NCTensor *x, size_t n); +uint32_t nc_tensor_get_hash(NCTensor *x); +void nc_dump_tensor_hash(const char *name, const NCTensor *x); +NCNode *nc_get_node(NCTensor *x); +/* create an alias to tensor 'x1'. 
Gradient is not propagated thru it */ +NCTensor *nc_slice_alias(const NCTensor *x1, int axis, size_t start, size_t end); + +NCTypeEnum nc_tensor_get_item_type(const NCTensor *x); +NCTensorData *nc_tensor_get_data(NCTensorData *sd, const NCTensor *x); +/* Return a pointer to the tensor data. If *pstride is non NULL, + return the stride (in elements) of the first dimension. */ +void *nc_tensor_get_ptr(NCTensor *x, size_t *pstride); +const size_t *nc_tensor_get_dims(const NCTensor *x, int *pn_dims); +void nc_tensor_set_zero(NCTensor *y); +void nc_tensor_set_f32(NCTensor *y, float val); +NCRNDState *nc_rnd_init(NCDevice *d, uint32_t seed); +void nc_rnd_end(NCRNDState *s); +void nc_tensor_set_rnd_unif(NCTensor *y, float avg, float range, + NCRNDState *rnd_state); +void nc_tensor_set_dropout(NCTensor *y, float prob, NCRNDState *rnd_state); + +void nc_set1_i32(NCTensor *y, int n_dims, const size_t *tab_indexes, + int32_t val); +void nc_set1_i32_1d(NCTensor *y, size_t i0, int32_t val); +void nc_set1_i32_2d(NCTensor *y, size_t i0, size_t i1, int32_t val); +void nc_set1_f32(NCTensor *y, int n_dims, const size_t *tab_indexes, + float val); +void nc_set1_f32_1d(NCTensor *y, size_t i0, float val); + +int32_t nc_get1_i32(const NCTensor *x, int n_dims, const size_t *tab_indexes); +float nc_get1_f32(const NCTensor *x, int n_dims, const size_t *tab_indexes); +float nc_get1_f32_1d(const NCTensor *x, size_t i0); +float nc_get_scalar_f32(const NCTensor *x); + +void nc_tensor_copy(NCTensor *dst, NCTensor *src); +void nc_tensor_convert(NCTensor *dst, NCTensor *src); + +void nc_dump_dims(const char *str, NCTensor *x); +size_t nc_get_heap_size(NCContext *m); +NCContext *nc_get_tensor_context(const NCTensor *x); +NCTensor *nc_tensor_to_device(NCTensor *x, NCDevice *d); +NCTensor *nc_tensor_to_cpu_device(NCTensor *x); +NCDevice *nc_get_tensor_device(const NCTensor *x); + +/* element wise operations */ +NCTensor *nc_convert(NCTensor *x, NCTypeEnum new_type); +NCTensor *nc_add(NCTensor *x1, 
NCTensor *x2); +NCTensor *nc_neg(NCTensor *x); +NCTensor *nc_sub(NCTensor *x1, NCTensor *x2); +NCTensor *nc_mul(NCTensor *x1, NCTensor *x2); +NCTensor *nc_div(NCTensor *x1, NCTensor *x2); +NCTensor *nc_recip(NCTensor *x); +NCTensor *nc_min(NCTensor *x1, NCTensor *x2); +NCTensor *nc_max(NCTensor *x1, NCTensor *x2); +/* select x1[i] if z[i] = 0 and x2[i] otherwise */ +NCTensor *nc_select(NCTensor *z, NCTensor *x1, NCTensor *x2); +/* set y[i] = x1[i] if mask[i] = 0 and y[i] = c if mask[i] != 0. If + mask_inv is TRUE, 'mask' is inverted */ +NCTensor *nc_masked_fill(NCTensor *x, NCTensor *mask, float c, BOOL mask_inv); +NCTensor *nc_sigmoid(NCTensor *x); +NCTensor *nc_tanh(NCTensor *x); +NCTensor *nc_relu(NCTensor *x); +NCTensor *nc_gelu(NCTensor *x); +NCTensor *nc_log(NCTensor *x); +/* return cp * fg + min(1 - fg, ig) * in */ +NCTensor *nc_lstm_clamped(NCTensor *cp, NCTensor *in, + NCTensor *fg, NCTensor *ig); +/* return a * (1 - t) + b * t */ +NCTensor *nc_lerp(NCTensor *a, NCTensor *b, NCTensor *t); + +/* other operations */ +NCTensor *nc_new_vec_f32(NCDevice *d, size_t n, float val); +NCTensor *nc_new_f32(NCDevice *d, float val); +NCTensor *nc_reshape(NCTensor *x, int n_dims, const size_t *dims); +NCTensor *nc_reshape_1d(NCTensor *x, size_t n0); +NCTensor *nc_reshape_2d(NCTensor *x, size_t n0, size_t n1); +NCTensor *nc_reshape_3d(NCTensor *x, size_t n0, size_t n1, size_t n2); +NCTensor *nc_reshape_4d(NCTensor *x, size_t n0, size_t n1, size_t n2, + size_t n3); +/* duplicate the tensor by adding n_dims dimensions */ +NCTensor *nc_repeat(NCTensor *x, int n_dims, const size_t *dims); +NCTensor *nc_repeat_1d(NCTensor *x, size_t n); +/* return y0 + sum over the dimensions > n_dims of 'x'. 
y0 = NULL + is supported */ +NCTensor *nc_reduce_sum(NCTensor *y0, NCTensor *x, int n_dims); +/* sum all the elements of a tensor */ +NCTensor *nc_sum(NCTensor *x); +/* sum of squares */ +NCTensor *nc_reduce_sum_sqr(NCTensor *x); +NCTensor *nc_slice(NCTensor *x, int axis, size_t start, size_t end); +NCTensor *nc_slice_add(NCTensor *y0, NCTensor *x, int axis, size_t start); +/* concatenation along axis 'axis' */ +NCTensor *nc_concat(NCTensor **inputs, int n_inputs, int axis); +/* shortcut for axis = 0 */ +NCTensor *nc_vconcat(NCTensor **inputs, int n_inputs); +/* shortcut for axis = 1 */ +NCTensor *nc_hconcat(NCTensor **inputs, int n_inputs); +/* split along axis 'axis'. If tab_size = NULL, split equally. */ +void nc_split(NCTensor **tab_y, NCTensor *x, int n_outputs, + const size_t *tab_size, int axis); +/* shortcut for axis = 0 */ +void nc_vsplit(NCTensor **tab_y, NCTensor *x, int n_outputs, + const size_t *tab_size); +/* shortcut for axis = 1 */ +void nc_hsplit(NCTensor **tab_y, NCTensor *x, int n_outputs, + const size_t *tab_size); + +typedef enum { + NC_PAD_ZERO, + NC_PAD_DUP, /* duplicate element */ + /* trim types, dual to padding */ + NC_TRIM_NORMAL = NC_PAD_ZERO, + NC_TRIM_SUM, /* add trimmed elements to the edge */ +} NCPadEnum; + +/* pad (len > 0) or trim (len < 0) the axis 0 of 'x' */ +NCTensor *nc_pad(NCTensor *x, ssize_t left_len, NCPadEnum left_op, + ssize_t right_len, NCPadEnum right_op); +/* shortcut to nc_pad() */ +NCTensor *nc_resize(NCTensor *x, size_t n); + +/* if x is not contiguous then create a new contiguous tensor and copy + x to it. Otherwise, return 'x'. */ +NCTensor *nc_make_contiguous(NCTensor *x); +/* Return a new tensor sharing the same buffer as 'x' with the permuted + dimensions. axis[i] is the corresponding axis in 'x' */ +NCTensor *nc_permute_alias(NCTensor *x, int n_dims, const int *axis); +/* same as nc_permute_alias but calls nc_make_contiguous after. 
*/ +NCTensor *nc_permute(NCTensor *x, int n_dims, const int *axis); +/* special case of nc_permute() */ +NCTensor *nc_transpose(NCTensor *x); +NCTensor *nc_matmul(NCTensor *w, NCTensor *x); +/* return w*x + y0. w and x can be optionally transposed. y0 can be NULL */ +NCTensor *nc_matmul_add(NCTensor *w, NCTensor *x, NCTensor *y0, + BOOL w_trans, BOOL x_trans); +NCTensor *nc_matmul_stride(NCTensor *w, NCTensor *x); +/* return a matrix where each column is the column x[i] of matrix 'w' */ +NCTensor *nc_get_col(NCTensor *w, NCTensor *x); +/* add the vectors 'z' at column number 'x' in matrix 'w'. */ +NCTensor *nc_add_col(NCTensor *z, NCTensor *x, NCTensor *w); +/* select the x-th element in each column of 'w' */ +NCTensor *nc_get_element(NCTensor *w, NCTensor *x); +/* add z to the x-th element in each column of 'w' */ +NCTensor *nc_add_element(NCTensor *z, NCTensor *x, NCTensor *w); +NCTensor *nc_soft_max(NCTensor *x); +/* Equivalent to y = log(get_element(x, eout)). It is expected to be + used as nc_index_log(nc_soft_max(x), eout) so that the gradient + computation is optimized. */ +NCTensor *nc_indexed_log(NCTensor *x, NCTensor *eout); +NCTensor *nc_layer_norm(NCTensor *x, float eps); +NCTensor *nc_rms_norm(NCTensor *x, float eps); +NCTensor *nc_slt_mat_set(NCTensor *x, size_t pos, float c); +/* shift the column 'i' by 'pos + i * mult' elements and pad with with zeros */ +NCTensor *nc_rel_shift(NCTensor *x, ssize_t pos, ssize_t mult); + +/* auto differentiation */ + +/* get_col_index is non NULL in the sparse gradient case */ +typedef void NCParamUpdateFunc(void *opaque, NCTensor *grad, + NCTensor *get_col_index); + +/* add a 'parameter' graph node to 'x' and return 'x'. 
*/ +NCTensor *nc_set_param(NCTensor *x, void *opaque); +/* return a new tensor with its graph removed */ +NCTensor *nc_stop_grad(NCTensor *x); + +/* manipulation of graph nodes */ +NCNode *nc_dup_node(const NCNode *n); +void nc_free_node(NCNode *n); +void nc_combine_nodes(NCContext *m, NCNode **tab_op1, int count, + int axis, int elem_size, const size_t *tab_elem_size); +NCNode *nc_concat_node(NCContext *m, NCNode **inputs, int count, + int axis, const size_t *tab_size); +void nc_concat_optimization(NCContext *m, NCNode **concat_nodes, int count); +void nc_node_set_parent(NCNode *n, int arg_index, const NCNode *n1); +void nc_node_set_arg(NCNode *n, int arg_index, const NCTensor *x); + +#define NC_BW_KEEP_GRAD_GRAPH (1 << 0) +/* optimize the nc_get_col() gradient */ +#define NC_BW_SPARSE_GRAD (1 << 1) + +void nc_backward(const NCTensor *x, NCTensor *grad, + NCParamUpdateFunc *param_update_func, int flags); +void nc_dump_graph(NCTensor *x); + +/* utilities for function parameters */ + +typedef struct { + struct list_head link; + NCTensor **pval; /* pointer to the tensor location */ + char *name; /* parameter name */ + NCTensor *low_part; /* if BF16 parameter, additional 16 bit precision */ + NCTensor *saved_grad; /* debug */ + /* SGD opt data */ + struct SGDOptVarState *sgd_opt; +} NCParam; + +typedef struct { + struct list_head param_list; + BOOL add_graph; +} NCParamList; + +void nc_param_list_init(NCParamList *pl); +void nc_param_list_set_graph(NCParamList *pl, BOOL add_graph); +NCParam *nc_new_param_str(NCParamList *pl, NCTensor **pval, const char *str); +__attribute__((format(printf, 3, 4))) NCParam *nc_new_param(NCParamList *pl, NCTensor **pval, const char *fmt, ...); +void nc_param_list_end(NCParamList *pl); + +NCParam *nc_find_param(NCParamList *pl, const char *name); +size_t nc_get_param_count(NCParamList *pl); +void nc_save_coefs(NCParamList *pl, const char *filename); +void nc_load_coefs(NCParamList *pl, const char *filename); +void 
nc_save_state(NCParamList *pl, const char *filename); +void nc_load_state(NCParamList *pl, const char *filename); + +/* SGD optimizer */ + +typedef enum { + SGD_OPT_BASIC, + SGD_OPT_ADAM, + SGD_OPT_TEST, +} SGDOptAlgoEnum; + +typedef struct { + SGDOptAlgoEnum algo; + union { + struct { + float beta1; + float beta2; + float eps; + float gradient_clip; /* if != 0, per parameter gradient clipping */ + } adam; + } u; + float lr; +} SGDOptParams; + +NCSGDOptState *nc_sgd_opt_init(NCContext *m, const SGDOptParams *p); +void nc_sgd_opt_end(NCSGDOptState *s); +void sgd_opt_update_var(void *opaque, NCTensor *yg, NCTensor *get_col_index); + +/* set the SGD optimizer 's' to all parameters of the model */ +void nc_sgd_opt_set_all(NCParamList *param_list, NCSGDOptState *s); + +/* set the SGD optimizer 's' to the variable 'x'. Remove it if s = NULL */ +void nc_sgd_opt_set(NCParam *x, NCSGDOptState *s); +void nc_sgd_opt_update(NCSGDOptState *s); +/* force the learning rate */ +void nc_sgd_opt_set_lr(NCSGDOptState *s, float lr); +float nc_sgd_opt_get_lr(NCSGDOptState *s); + +/* for SGD_OPT_TEST */ +NCTensor *nc_sgd_opt_get_grad(NCParam *p); + +/* misc utilities (to be removed) */ + +typedef struct { + uint32_t seed; + /* used by Gaussian generator */ + int idx; + float y1; +} RNDState; + +typedef struct { + uint16_t u16; +} nc_float16_t; + +void rnd_init(RNDState *s, uint32_t seed); +uint32_t rnd_unif_u32(RNDState *s); +float rnd_unif(RNDState *s); +void rnd_unif_vec(float *tab, size_t n, float mu, float range, + RNDState *s); +void rnd_unif_mat(float *tab, size_t stride, size_t h, size_t w, + float mu, float sigma, RNDState *s); + +float vec_sum_f32(const float *tab, size_t n); + +typedef struct { + float val; + uint32_t idx; +} NCTopKEntry; + +/* Return the k largest values among prob[0...n_symb-1] such that k is + the largest value such that k <= topk and sum(i=0 .. k - 2, + prob[tab[i]]) < topp. + + It is assumed that prob[i] >= 0. The function returns (k, tab, + sum). 
'sum' is the sum of the k returned values. 'tab' must be + freed with nc_free(). */ +int nc_topk(NCTopKEntry **ptab, double *psum, + const float *prob, size_t n, int topk, float topp); + +#endif /* LIBNC_H */ diff --git a/gpt2/list.h b/gpt2/list.h new file mode 100644 index 0000000..9ceddef --- /dev/null +++ b/gpt2/list.h @@ -0,0 +1,96 @@ +/* + * Linux klist like system + * + * Copyright (c) 2016-2017 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef LIST_H +#define LIST_H + +struct list_head { + struct list_head *prev; + struct list_head *next; +}; + +#define LIST_HEAD_INIT(el) { &(el), &(el) } + +/* return the pointer of type 'type *' containing 'el' as field 'member' */ +#define list_entry(el, type, member) \ + ((type *)((uint8_t *)(el) - offsetof(type, member))) + +static inline void init_list_head(struct list_head *head) +{ + head->prev = head; + head->next = head; +} + +/* insert 'el' between 'prev' and 'next' */ +static inline void __list_add(struct list_head *el, + struct list_head *prev, struct list_head *next) +{ + prev->next = el; + el->prev = prev; + el->next = next; + next->prev = el; +} + +/* add 'el' at the head of the list 'head' (= after element head) */ +static inline void list_add(struct list_head *el, struct list_head *head) +{ + __list_add(el, head, head->next); +} + +/* add 'el' at the end of the list 'head' (= before element head) */ +static inline void list_add_tail(struct list_head *el, struct list_head *head) +{ + __list_add(el, head->prev, head); +} + +static inline void list_del(struct list_head *el) +{ + struct list_head *prev, *next; + prev = el->prev; + next = el->next; + prev->next = next; + next->prev = prev; + el->prev = NULL; /* fail safe */ + el->next = NULL; /* fail safe */ +} + +static inline int list_empty(struct list_head *el) +{ + return el->next == el; +} + +#define list_for_each(el, head) \ + for(el = (head)->next; el != (head); el = el->next) + +#define list_for_each_safe(el, el1, head) \ + for(el = (head)->next, el1 = el->next; el != (head); \ + el = el1, el1 = el->next) + +#define list_for_each_prev(el, head) \ + for(el = (head)->prev; el != (head); el = el->prev) + +#define list_for_each_prev_safe(el, el1, head) \ + for(el = (head)->prev, el1 = el->prev; el != (head); \ + el = el1, el1 = el->prev) + +#endif /* LIST_H */ diff --git a/gpt2/readme.txt b/gpt2/readme.txt new file mode 100644 index 0000000..3f95757 --- /dev/null +++ b/gpt2/readme.txt @@ 
tar xf gpt2tc-117M.tar.gz
+- For reference, the results of CMIX + (http://www.byronknoll.com/cmix.html) are provided. + +4) More information +------------------- + +This demo has no external dependency. It is written in C and uses the +LibNC library for tensor manipulation. The CPU must support AVX2. + +A similar program is used for http://textsynth.org/ diff --git a/justlm.hpp b/justlm.hpp new file mode 100644 index 0000000..4347ca7 --- /dev/null +++ b/justlm.hpp @@ -0,0 +1,54 @@ +#ifndef LLM_H +#define LLM_H +#include +#include +#include +#include +#include +#include + + +class LLM { + struct { + int32_t seed; // RNG seed + int32_t n_threads = static_cast(std::thread::hardware_concurrency()) / 2; + union { + int32_t n_ctx; // Context size, llama.cpp specific + int32_t n_prompt = -1; // Prompt size, gpt2 specific + }; + int32_t n_batch = 8; // Batch size, unused + + int32_t top_k = 40; + float top_p = 0.5f; + float temp = 0.72f; + } params; + + struct State *state; + + void init(const std::string& weights_path); + + static + bool ends_with(std::string_view str, std::string_view suffix); + +public: + struct Exception : public std::runtime_error { + using std::runtime_error::runtime_error; + }; + struct ContextLengthException : public Exception { + ContextLengthException() : Exception("Max. 
context length exceeded") {} + }; + + LLM(const std::string& weights_path, int32_t seed = 0) { + // Set random seed + params.seed = seed?seed:time(NULL); + + // Initialize llm + init(weights_path); + } + ~LLM(); + + void append(std::string_view prompt, const std::function& on_tick = nullptr); + + std::string run(std::string_view end, const std::function& on_tick = nullptr); +}; +#endif // LLM_H diff --git a/libjustlm_core.cpp b/libjustlm_core.cpp new file mode 100644 index 0000000..493313c --- /dev/null +++ b/libjustlm_core.cpp @@ -0,0 +1,9 @@ +#include "justlm.hpp" + +#include + + + +bool LLM::ends_with(std::string_view str, std::string_view suffix) { + return str.size() >= suffix.size() && 0 == str.compare(str.size()-suffix.size(), suffix.size(), suffix); +} diff --git a/libjustlm_gpt2.cpp b/libjustlm_gpt2.cpp new file mode 100644 index 0000000..c29f42c --- /dev/null +++ b/libjustlm_gpt2.cpp @@ -0,0 +1,80 @@ +#include "justlm.hpp" +#include "gpt2/gpt2tc.h" + +#include +#include + + +struct State { + std::string prompt; + std::string model_path; + GPT2ModelEnum model; +} state; + + + +void LLM::init(const std::string& weights_path) { + state->model_path = weights_path; + // Get weight file size + auto weights_size = std::filesystem::file_size(weights_path); + // Determine weight size + switch (weights_size) { + case 250700242: state->model = GPT2_MODEL_117M; break; + case 3120522738: state->model = GPT2_MODEL_1558M; break; + case 712396722: state->model = GPT2_MODEL_345M; break; + case 1551900050: state->model = GPT2_MODEL_774M; break; + default: throw Exception("Unknown model size"); + } +} + +LLM::~LLM() { + delete state; +} + +void LLM::append(std::string_view prompt, const std::function &on_tick) { + state->prompt.append(prompt); + std::cout << prompt << std::endl; +} + +std::string LLM::run(std::string_view end, const std::function &on_tick) { + std::string fres; + TextCompleteGlobalState *tcs; + TextGenContext *ts; + int count; + struct timeval tv; + struct 
list_head ts_list; + + // Initialize completion + tcs = text_complete_global_init(state->model, state->model_path.c_str()); + + // Run completion + ts = text_complete_start(tcs, state->prompt.c_str(), params.top_k, params.top_p, params.temp, + params.seed, params.n_prompt>0?params.n_prompt:0xfffffff - state->prompt.size()); + bool abort = false; + while (!abort && !ends_with(fres, end)) { + // Run completion + init_list_head(&ts_list); + list_add_tail(&ts->link, &ts_list); + text_complete_next(tcs, &ts_list); + if (ts->out_text_len == 0) + break; + auto str = std::string_view{ts->out_text, static_cast(ts->out_text_len)}; + + // Append result to fres + fres.append(str); + + // Tick + if (on_tick && !on_tick(std::string(str).c_str()) /*Huge overhead in favor of llama.cpp*/) abort = true; + } + // End completion + text_complete_end(ts); + + text_complete_global_end(tcs); + + // Create final string TODO: Could be optimized + state->prompt.append(fres); + fres = std::string(fres.data(), fres.size()-end.size()); + + // Return final string + return fres; +} diff --git a/libjustlm_llama.cpp b/libjustlm_llama.cpp new file mode 100644 index 0000000..c3e2981 --- /dev/null +++ b/libjustlm_llama.cpp @@ -0,0 +1,115 @@ +#include "justlm.hpp" + +#include +#include + + +struct State { + llama_context *ctx = nullptr; + std::string prompt; + std::vector embd; + int n_ctx; + std::string last_result; +} state; + + + +void LLM::init(const std::string& weights_path) { + // Allocate state + state = new State; + + // Get llama parameters + auto lparams = llama_context_default_params(); + lparams.seed = params.seed; + lparams.n_ctx = params.n_ctx>0?params.n_ctx:2024; + + // Create context + state->ctx = llama_init_from_file(weights_path.c_str(), lparams); + if (!state->ctx) { + throw Exception("Failed to initialize llama from file"); + } + + // Initialize some variables + state->n_ctx = llama_n_ctx(state->ctx); +} + +LLM::~LLM() { + if (state->ctx) llama_free(state->ctx); + delete state; +} 
+ +void LLM::append(std::string_view prompt, const std::function &on_tick) { + // Check if prompt was empty + const bool was_empty = state->prompt.empty(); + + // Append to current prompt + state->prompt.append(prompt); + + // Resize buffer for tokens + const auto old_token_count = state->embd.size(); + state->embd.resize(old_token_count+state->prompt.size()+1); + + // Run tokenizer + const auto token_count = llama_tokenize(state->ctx, prompt.data(), state->embd.data()+old_token_count, state->embd.size()-old_token_count, was_empty); + state->embd.resize(old_token_count+token_count); + + // Make sure limit is far from being hit + if (state->embd.size() > state->n_ctx-6) { + // Yup. *this MUST be decomposed now. + throw ContextLengthException(); + } + + // Evaluate new tokens + // TODO: Larger batch size + std::cout << "Context size: " << old_token_count << '+' << token_count << '=' << state->embd.size() << '/' << state->n_ctx << std::endl; + for (int it = old_token_count; it != state->embd.size(); it++) { + std::cout << llama_token_to_str(state->ctx, state->embd.data()[it]) << std::flush; + llama_eval(state->ctx, state->embd.data()+it, 1, it, params.n_threads); + + // Tick + if (on_tick) { + // Calculate progress + auto progress = float(it-old_token_count) / (state->embd.size()-old_token_count) * 100.f; + // Run callback + if (!on_tick(progress)) break; + } + } + std::cout << std::endl; +} + +std::string LLM::run(std::string_view end, const std::function &on_tick) { + std::string fres; + + // Loop until done + bool abort = false; + while (!abort && !ends_with(fres, end)) { + // Sample top p and top k + const auto id = llama_sample_top_p_top_k(state->ctx, nullptr, 0, params.top_k, params.top_p, params.temp, 1.0f); + + // Add token + state->embd.push_back(id); + + // Get token as string + const auto str = llama_token_to_str(state->ctx, id); + + // Debug + std::cout << str << std::flush; + + // Append string to function result + fres.append(str); + + // Evaluate token 
+ // TODO: Respect batch size + llama_eval(state->ctx, state->embd.data()+state->embd.size()-1, 1, state->embd.size()-1, params.n_threads); + + // Tick + if (on_tick && !on_tick(str)) abort = true; + } + + // Create final string TODO: Could be optimized + state->prompt.append(fres); + fres = std::string(fres.data(), fres.size()-end.size()); + + // Return final string + return fres; +} diff --git a/llama.cpp b/llama.cpp new file mode 160000 index 0000000..9cbc404 --- /dev/null +++ b/llama.cpp @@ -0,0 +1 @@ +Subproject commit 9cbc404ba6699a9ba4925ea25a60552b13491c7a diff --git a/test.cpp b/test.cpp new file mode 100644 index 0000000..94f5868 --- /dev/null +++ b/test.cpp @@ -0,0 +1,12 @@ +#include "ai.hpp" + +#include + + + +int main() { + Ai ai; + std::cout << "Completing \"she replied that\"..." << std::endl; + std::cout << "Using model " << ai.model_name << "..." << std::endl; + std::cout << "> she replied that" << ai.complete("she replied that", '\n') << std::endl; +}