1
0
Fork 0
mirror of https://gitlab.com/niansa/libjustlm.git synced 2025-03-06 20:49:17 +01:00

Initial commit

This commit is contained in:
niansa/tuxifan 2023-03-30 07:03:33 -05:00
commit aaddcc0cbd
21 changed files with 4037 additions and 0 deletions

74
.gitignore vendored Normal file
View file

@ -0,0 +1,74 @@
# This file is used to ignore files which are generated
# ----------------------------------------------------------------------------
*~
*.autosave
*.a
*.core
*.moc
*.o
*.obj
*.orig
*.rej
*.so
*.so.*
*_pch.h.cpp
*_resource.rc
*.qm
.#*
*.*#
core
!core/
tags
.DS_Store
.directory
*.debug
Makefile*
*.prl
*.app
moc_*.cpp
ui_*.h
qrc_*.cpp
Thumbs.db
*.res
*.rc
/.qmake.cache
/.qmake.stash
# qtcreator generated files
*.pro.user*
CMakeLists.txt.user*
# xemacs temporary files
*.flc
# Vim temporary files
.*.swp
# Visual Studio generated files
*.ib_pdb_index
*.idb
*.ilk
*.pdb
*.sln
*.suo
*.vcproj
*vcproj.*.*.user
*.ncb
*.sdf
*.opensdf
*.vcxproj
*vcxproj.*
# MinGW generated files
*.Debug
*.Release
# Python byte code
*.pyc
# Binaries
# --------
*.dll
*.exe

3
.gitmodules vendored Normal file
View file

@ -0,0 +1,3 @@
[submodule "llama.cpp"]
path = llama.cpp
url = https://github.com/ggerganov/llama.cpp.git

24
CMakeLists.txt Normal file
View file

@ -0,0 +1,24 @@
# Build script for libjustlm: a static core library plus one backend
# library selected at configure time via the LM_BACKEND cache variable.
cmake_minimum_required(VERSION 3.14)
project(libjustlm LANGUAGES C CXX)
# The core sources require C++17.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Core library.
# NOTE(review): a target named "libjustlm" yields "liblibjustlm.a" on
# Unix -- confirm the doubled "lib" prefix is intended.
add_library(libjustlm STATIC
libjustlm_core.cpp
justlm.hpp
)
set(LM_BACKEND "llama.cpp" CACHE STRING "The language model backend to use")
if (LM_BACKEND STREQUAL "libnc gpt2")
# GPT-2 backend: compiles the in-tree gpt2/ sources and links the
# prebuilt libnc shared object shipped in the repository.
add_library(libjustlm_gpt2 STATIC libjustlm_gpt2.cpp gpt2/arith.c gpt2/cp_utils.c gpt2/gpt2tc.c)
target_link_libraries(libjustlm_gpt2 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gpt2/libnc.so pthread)
elseif (LM_BACKEND STREQUAL "llama.cpp")
# llama.cpp backend: built from the git submodule added in .gitmodules.
add_subdirectory(llama.cpp)
add_library(libjustlm_llama STATIC libjustlm_llama.cpp)
target_link_libraries(libjustlm_llama PRIVATE llama)
else()
message(FATAL_ERROR "LM_BACKEND '${LM_BACKEND}' is unsupported. Please use either 'libnc gpt2' or 'llama.cpp'.")
endif()

1
gpt2/VERSION Normal file
View file

@ -0,0 +1 @@
2021-04-24

301
gpt2/arith.c Normal file
View file

@ -0,0 +1,301 @@
/*
* Arithmetic coder
*
* Copyright (c) 2018-2021 Fabrice Bellard
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <inttypes.h>
#include <assert.h>
#include <time.h>
#include <getopt.h>
#include "cutils.h"
#include "arith.h"
#define RANGE_MIN_BITS 16
#define RANGE_MIN ((0xff << (RANGE_MIN_BITS - 8)) + 1)
#define RANGE_MAX (0xff << RANGE_MIN_BITS)
//#define DUMP_PUT_BIT
//#define DUMP_GET_BIT
/* Initialise the arithmetic-coder output state. Encoded bytes are
   accumulated in 'buf' (of 'buf_size' bytes) and handed to
   write_func(opaque, ...) whenever the buffer fills up. */
void put_bit_init(PutBitState *s, uint8_t *buf, int buf_size,
                  PutBitWriteFunc *write_func, void *opaque)
{
    assert(PROB_UNIT <= RANGE_MIN);
    /* coder registers */
    s->range = RANGE_MAX;
    s->low = 0;
    /* deferred carry-propagation state */
    s->current_byte = 0xff;
    s->n_bytes = 0;
    /* output buffering */
    s->buf = buf;
    s->buf_size = buf_size;
    s->idx = 0;
    s->byte_count = 0;
    s->write_func = write_func;
    s->opaque = opaque;
}
/* Append one encoded byte to the output buffer; when the buffer
   becomes full, account for it and flush it via the user callback. */
static void put_byte(PutBitState *s, int v)
{
    s->buf[s->idx] = v;
    s->idx++;
    if (unlikely(s->idx == s->buf_size)) {
        s->byte_count += s->idx;
        s->write_func(s->opaque, s->buf, s->idx);
        s->idx = 0;
    }
}
/* 0 <= v <= 0x1fe. The current output stream contains n_bytes with:
   current_byte, then (n_bytes - 1) x 0xff
*/
/* Emit a 9-bit value into the byte stream with deferred carry
   propagation: 0xff bytes are only counted (not written) because a
   later carry may still turn them into 0x00. */
static void put_val(PutBitState *s, int v)
{
    uint32_t carry, b;
#ifdef DUMP_PUT_BIT
    printf(" out=%d\n", v);
#endif
    if (v == 0xff) {
        /* may still be modified by a future carry: just extend the run */
        s->n_bytes++;
    } else {
        if (s->n_bytes > 0) {
            /* resolve the pending run: bit 8 of v is the carry into the
               first pending byte; a carry turns the 0xff run into 0x00 */
            carry = v >> 8;
            put_byte(s, s->current_byte + carry);
            b = (0xff + carry) & 0xff;
            while (s->n_bytes > 1) {
                put_byte(s, b);
                s->n_bytes--;
            }
        }
        /* low 8 bits of v become the new pending byte */
        s->n_bytes = 1;
        s->current_byte = v;
    }
}
/* Flush the pending carry-propagation run so that every encoded byte
   has been handed to put_byte(). */
static void put_val_flush(PutBitState *s)
{
    if (s->n_bytes == 0)
        return;
    put_val(s, 0);
}
/* Renormalise the encoder: while the coding interval is too small,
   emit the top bits of 'low' and scale low/range up by 8 bits. */
static void put_bit_renorm(PutBitState *s)
{
    uint32_t v;
    /* after renormalisation:
       0 <= low <= RANGE_MAX
       RANGE_MIN <= range <= RANGE_MAX
       In the worst case before normalisation:
       low_max = 2 * RANGE_MAX hence v <= 0x1fe
    */
    while (s->range < RANGE_MIN) {
        /* top 9 bits of low go to the byte stream (carry handled there) */
        v = s->low >> RANGE_MIN_BITS;
        put_val(s, v);
        s->low = (s->low & ((1 << RANGE_MIN_BITS) - 1)) << 8;
        s->range <<= 8;
    }
}
/* Encode one bit. prob0 is the probability of bit == 0, scaled so
   that 0 < prob0 < PROB_UNIT. */
void put_bit(PutBitState *s, int prob0, int bit)
{
    int range0;

    assert(s->range >= RANGE_MIN);
    /* split the current interval according to prob0 */
    range0 = ((uint64_t)s->range * prob0) >> PROB_UNIT_BITS;
    assert(range0 > 0);
    assert(range0 < s->range);
#if defined(DUMP_PUT_BIT)
    {
        static int count;
        printf("%d: range=%d b=%d range0=%d low=%d\n",
               count++, s->range, bit, range0, s->low);
    }
#endif
    if (bit) {
        /* keep the upper sub-interval */
        s->low += range0;
        s->range -= range0;
    } else {
        /* keep the lower sub-interval */
        s->range = range0;
    }
    put_bit_renorm(s);
}
/* Encode one bit without a probability model (implicit p0 = 1/2). */
void put_bit_raw(PutBitState *s, int bit)
{
    int half;

    assert(s->range >= RANGE_MIN);
    half = s->range >> 1;
    if (bit) {
        s->low += half;
        s->range -= half;
    } else {
        s->range = half;
    }
    put_bit_renorm(s);
}
/* return the minimum number of bits to be able to correctly decode */
/* Terminate the stream: choose a value inside [low, low+range) with as
   many trailing zero bits as possible, emit it, and flush the output
   buffer through write_func. */
int64_t put_bit_flush(PutBitState *s)
{
    int n, val, mask;
    /* force larger range */
    if (s->range < (1 << RANGE_MIN_BITS)) {
        put_val(s, s->low >> RANGE_MIN_BITS);
        s->low = (s->low & ((1 << RANGE_MIN_BITS) - 1)) << 8;
        s->range <<= 8;
    }
    /* largest n such as 2^n <= range */
    n = 0;
    while ((1 << (n + 1)) <= s->range)
        n++;
    assert(n >= RANGE_MIN_BITS && n <= (RANGE_MIN_BITS + 7));
    /* round low up to a multiple of 2^n; it stays inside the interval
       because range >= 2^n */
    val = s->low;
    mask = (1 << n) - 1;
    if ((val & mask) != 0)
        val = (val + (1 << n)) & ~mask;
    assert(val >= s->low && val < s->low + s->range);
    put_val(s, val >> RANGE_MIN_BITS);
    put_val_flush(s);
    /* hand any buffered bytes to the user callback */
    if (s->idx > 0) {
        s->byte_count += s->idx;
        s->write_func(s->opaque, s->buf, s->idx);
        s->idx = 0;
    }
    return (s->byte_count - 1) * 8 + (RANGE_MIN_BITS + 8 - n);
}
/* return the approximate number of written bits */
int64_t put_bit_get_bit_count(PutBitState *s)
{
    int n;

    /* n = floor(log2(range)) */
    for (n = 0; (1 << (n + 1)) <= s->range; n++)
        continue;
    return (s->byte_count + s->idx) * 8 + (RANGE_MIN_BITS + 7 - n);
}
/****************************************/
/* Decoder-side refill: shift one more input byte into 'low', scaling
   'range' accordingly. When the buffer is exhausted it is refilled via
   read_func; with no read_func the input is padded with zero bytes. */
static void refill(GetBitState *s)
{
    s->range <<= 8;
    s->low <<= 8;
    if (s->idx >= s->buf_len) {
        if (!s->read_func)
            return; /* pad with zeros */
        s->buf_len = s->read_func(s->opaque, s->buf, s->buf_size);
        s->byte_count += s->buf_len;
        s->idx = 0;
    }
#ifdef DUMP_GET_BIT
    printf(" in=%d\n", s->buf[s->idx]);
#endif
    s->low += s->buf[s->idx++];
}
/* Initialise the arithmetic-coder input state. If read_func is NULL,
   'buf' is treated as a prefilled buffer of buf_size bytes; otherwise
   bytes are pulled on demand via read_func(opaque, ...). */
void get_bit_init(GetBitState *s, uint8_t *buf, size_t buf_size,
                  GetBitReadFunc *read_func, void *opaque)
{
    int i;
    s->buf_size = buf_size;
    s->buf = buf;
    s->read_func = read_func;
    s->opaque = opaque;
    if (read_func) {
        s->buf_len = 0;
    } else {
        /* prefilled buffer */
        s->buf_len = s->buf_size;
    }
    s->byte_count = s->buf_len;
    s->range = 0;
    s->low = 0;
    s->idx = 0;
    /* prime 'low' with the first input bytes (each refill shifts in 8
       bits), mirroring the encoder's initial state */
    for(i = 0; i <= RANGE_MIN_BITS; i += 8) {
        refill(s);
    }
    s->range = RANGE_MAX;
}
/* Decode one bit. prob0 is the probability of bit == 0, scaled so
   that 0 < prob0 < PROB_UNIT; mirrors put_bit(). */
int get_bit(GetBitState *s, int prob0)
{
    int b, range0;

    assert(s->range >= RANGE_MIN);
    /* split the interval exactly as the encoder did */
    range0 = ((uint64_t)s->range * prob0) >> PROB_UNIT_BITS;
    assert(range0 > 0);
    assert(range0 < s->range);
    b = (s->low >= range0);
#ifdef DUMP_GET_BIT
    {
        static int count;
        printf("%d: range=%d b=%d range0=%d low=%d\n", count++, s->range, b, range0, s->low);
    }
#endif
    if (!b) {
        s->range = range0;
    } else {
        s->low -= range0;
        s->range -= range0;
    }
    while (s->range < RANGE_MIN)
        refill(s);
    return b;
}
/* Decode one bit written by put_bit_raw() (no probability model). */
int get_bit_raw(GetBitState *s)
{
    int b, half;

    half = s->range >> 1;
    b = (s->low >= half);
    if (!b) {
        s->range = half;
    } else {
        s->low -= half;
        s->range -= half;
    }
    if (s->range < RANGE_MIN)
        refill(s);
    return b;
}
/* return the approximate number of read bits */
int64_t get_bit_get_bit_count(GetBitState *s)
{
    int n;

    /* n = floor(log2(range)) */
    for (n = 0; (1 << (n + 1)) <= s->range; n++)
        continue;
    return (s->byte_count - s->buf_len + s->idx) * 8 - n;
}

73
gpt2/arith.h Normal file
View file

@ -0,0 +1,73 @@
/*
* Arithmetic coder
*
* Copyright (c) 2018-2019 Fabrice Bellard
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef ARITH_H
#define ARITH_H
/* Public interface of the binary arithmetic coder (arith.c).
   Bit probabilities are integers p with 0 < p < PROB_UNIT. */
#define PROB_UNIT_BITS 15
#define PROB_UNIT (1 << PROB_UNIT_BITS)
/* callback invoked by the encoder to flush a buffer of encoded bytes */
typedef void PutBitWriteFunc(void *opaque, const uint8_t *buf, size_t buf_size);
/* encoder state */
typedef struct {
    uint32_t range;       /* current coding interval width */
    uint32_t low;         /* current coding interval base */
    uint8_t current_byte; /* first byte of the pending carry run */
    uint32_t n_bytes;     /* pending run: current_byte then (n_bytes-1) x 0xff */
    uint8_t *buf;         /* output buffer */
    size_t buf_size;
    size_t idx; /* current position in bytes */
    PutBitWriteFunc *write_func;
    void *opaque;         /* user pointer passed to write_func */
    uint64_t byte_count;  /* total bytes flushed so far */
} PutBitState;
void put_bit_init(PutBitState *s, uint8_t *buf, int buf_size,
                  PutBitWriteFunc *write_func, void *opaque);
/* encode one bit; prob0 is the probability of bit == 0 */
void put_bit(PutBitState *s, int prob0, int bit);
/* encode one bit with implicit probability 1/2 */
void put_bit_raw(PutBitState *s, int bit);
/* terminate the stream; returns the minimum bit count needed to decode */
int64_t put_bit_flush(PutBitState *s);
int64_t put_bit_get_bit_count(PutBitState *s);
/* return the number of read bytes */
typedef ssize_t GetBitReadFunc(void *opaque, uint8_t *buf, size_t buf_size);
/* decoder state */
typedef struct {
    uint8_t *buf;  /* input buffer */
    int buf_len;   /* number of valid bytes in buf */
    int buf_size;
    int idx;       /* read position in buf */
    uint32_t low;
    uint32_t range;
    GetBitReadFunc *read_func; /* NULL when buf is prefilled */
    void *opaque;
    uint64_t byte_count;
} GetBitState;
void get_bit_init(GetBitState *s, uint8_t *buf, size_t buf_size,
                  GetBitReadFunc *read_func, void *opaque);
/* decode one bit; prob0 is the probability of bit == 0 */
int get_bit(GetBitState *s, int prob0);
int get_bit_raw(GetBitState *s);
int64_t get_bit_get_bit_count(GetBitState *s);
#endif /* ARITH_H */

316
gpt2/cp_utils.c Normal file
View file

@ -0,0 +1,316 @@
/*
* Compression utilities
*
* Copyright (c) 2018-2019 Fabrice Bellard
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <inttypes.h>
#include <assert.h>
#include <time.h>
#include <getopt.h>
#include <stdarg.h>
#include <sys/time.h>
#include <sys/stat.h>
#ifdef _WIN32
#include <direct.h>
#endif
#include "cutils.h"
#include "libnc.h"
#include "cp_utils.h"
/* Print a formatted fatal-error message on stderr and terminate the
   process with exit status 1. Never returns (declared noreturn in
   cp_utils.h). */
void fatal_error(const char *fmt, ...)
{
    va_list ap;

    va_start(ap, fmt);
    fprintf(stderr, "Fatal error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    /* each va_start must be paired with va_end (C standard 7.16.1);
       the original leaked the va_list before exiting */
    va_end(ap);
    exit(1);
}
/* Return a millisecond timestamp: CLOCK_MONOTONIC on POSIX systems,
   gettimeofday() on Windows/MinGW builds. */
int64_t get_time_ms(void)
{
#ifdef _WIN32
    struct timeval tv;
    int64_t ms;
    gettimeofday(&tv, NULL);
    ms = (int64_t)tv.tv_sec * 1000;
    ms += tv.tv_usec / 1000U;
    return ms;
#else
    struct timespec ts;
    int64_t ms;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    ms = (int64_t)ts.tv_sec * 1000;
    ms += ts.tv_nsec / 1000000U;
    return ms;
#endif
}
/* Write one byte to stream f. */
void fput_u8(FILE *f, uint8_t v)
{
    fwrite(&v, 1, 1, f);
}
/* Read one byte from f into *pv. Returns 0 on success, -1 on
   EOF/error (in which case *pv is untouched). */
int fget_u8(FILE *f, uint8_t *pv)
{
    int c = fgetc(f);
    if (c < 0)
        return -1;
    *pv = (uint8_t)c;
    return 0;
}
/* Write v to f in big-endian byte order (2 bytes). */
void fput_be16(FILE *f, uint16_t v)
{
    uint8_t buf[2];
    buf[0] = v >> 8;
    buf[1] = v & 0xff;
    fwrite(buf, 1, sizeof(buf), f);
}
/* Read a big-endian 16-bit value from f. Returns 0 on success,
   -1 on EOF/short read. */
int fget_be16(FILE *f, uint16_t *pv)
{
    int hi, lo;
    hi = fgetc(f);
    if (hi < 0)
        return -1;
    lo = fgetc(f);
    if (lo < 0)
        return -1;
    *pv = (uint16_t)((hi << 8) | lo);
    return 0;
}
/* Write v to f in big-endian byte order (4 bytes). */
void fput_be32(FILE *f, uint32_t v)
{
    int shift;
    for (shift = 24; shift >= 0; shift -= 8)
        fputc((v >> shift) & 0xff, f);
}
/* Read a big-endian 32-bit value from f. Returns 0 on success,
   -1 on EOF/short read. */
int fget_be32(FILE *f, uint32_t *pv)
{
    uint8_t buf[4];
    uint32_t v;
    int i;

    if (fread(buf, 1, sizeof(buf), f) != sizeof(buf))
        return -1;
    v = 0;
    for (i = 0; i < 4; i++)
        v = (v << 8) | buf[i];
    *pv = v;
    return 0;
}
/* Serialise SGD optimiser parameters: one algorithm byte followed by
   the algorithm-specific fields. Aborts on an unknown algorithm. */
void fput_sgd_opt(FILE *f, const SGDOptParams *p)
{
    fput_u8(f, p->algo);
    if (p->algo == SGD_OPT_BASIC) {
        /* no extra parameters */
    } else if (p->algo == SGD_OPT_ADAM) {
        fput_f32(f, p->u.adam.beta1);
        fput_f32(f, p->u.adam.beta2);
        fput_f32(f, p->u.adam.eps);
        fput_f32(f, p->u.adam.gradient_clip);
    } else {
        abort();
    }
}
/* Deserialise SGD optimiser parameters written by fput_sgd_opt().
   Returns 0 on success, -1 on read error or unknown algorithm. */
int fget_sgd_opt(FILE *f, SGDOptParams *p)
{
    uint8_t algo;

    if (fget_u8(f, &algo))
        return -1;
    p->algo = algo;
    if (p->algo == SGD_OPT_BASIC)
        return 0;
    if (p->algo != SGD_OPT_ADAM)
        return -1;
    if (fget_f32(f, &p->u.adam.beta1))
        return -1;
    if (fget_f32(f, &p->u.adam.beta2))
        return -1;
    if (fget_f32(f, &p->u.adam.eps))
        return -1;
    if (fget_f32(f, &p->u.adam.gradient_clip))
        return -1;
    return 0;
}
/* Print a human-readable dump of the optimiser parameters to f.
   Aborts on an unknown algorithm. */
void dump_sgd_opt_params(FILE *f, const SGDOptParams *p)
{
    if (p->algo == SGD_OPT_BASIC) {
        fprintf(f, " sgd_opt=%s",
                "none");
    } else if (p->algo == SGD_OPT_ADAM) {
        fprintf(f, " sgd_opt=%s beta1=%g beta2=%g eps=%g gclip=%g",
                "adam",
                p->u.adam.beta1,
                p->u.adam.beta2,
                p->u.adam.eps,
                p->u.adam.gradient_clip);
    } else {
        abort();
    }
}
/* Union used to reinterpret the bits of a float as a uint32_t for
   endian-stable serialisation (type punning through a union is
   well-defined in C). */
typedef union {
    float f;
    uint32_t u32;
} f32;
/* Write a float as its IEEE-754 bit pattern, big-endian. */
void fput_f32(FILE *f, float v)
{
    uint32_t bits;
    memcpy(&bits, &v, sizeof(bits));
    fput_be32(f, bits);
}
/* Read a big-endian IEEE-754 bit pattern into a float. Returns 0 on
   success, -1 on error. */
int fget_f32(FILE *f, float *pv)
{
    uint32_t bits;
    if (fget_be32(f, &bits))
        return -1;
    memcpy(pv, &bits, sizeof(*pv));
    return 0;
}
/* Arithmetic-encode symbol 'sym' (0 <= sym < n_symb) against the given
   probability table by binary bisection of the symbol range: each step
   encodes one bit selecting the lower or upper half, with the bit
   probability derived from the summed table mass of the lower half. */
void write_sym(PutBitState *pb, const float *prob_table, int n_symb, int sym)
{
    int start, range, prob0, bit, range0;
    float p, p0;
    start = 0;
    range = n_symb;
    p = 1.0; /* invariant: p=sum(prob_table[start...start + range]) */
    while (range > 1) {
        range0 = range >> 1;
        /* mass of the lower half, rescaled to the PROB_UNIT fixed point;
           clamped so put_bit always gets 0 < prob0 < PROB_UNIT */
        p0 = vec_sum_f32(prob_table + start, range0);
        prob0 = lrintf(p0 * PROB_UNIT / p);
        prob0 = clamp_int(prob0, 1, PROB_UNIT - 1);
        bit = sym >= (start + range0);
        put_bit(pb, prob0, bit);
        if (bit) {
            start += range0;
            range = range - range0;
            p = p - p0;
        } else {
            p = p0;
            range = range0;
        }
    }
}
/* Decode one symbol written by write_sym(); performs the identical
   bisection so that the probabilities match the encoder bit for bit.
   Returns the decoded symbol index in [0, n_symb). */
int read_sym(GetBitState *gb, const float *prob_table, int n_symb)
{
    int start, range, prob0, bit, range0;
    float p, p0;
    start = 0;
    range = n_symb;
    p = 1.0; /* invariant: p=sum(prob_table[start...start + range]) */
    while (range > 1) {
        range0 = range >> 1;
        /* must compute exactly the same prob0 as write_sym() */
        p0 = vec_sum_f32(prob_table + start, range0);
        prob0 = lrintf(p0 * PROB_UNIT / p);
        prob0 = clamp_int(prob0, 1, PROB_UNIT - 1);
        bit = get_bit(gb, prob0);
        if (bit) {
            start += range0;
            range = range - range0;
            p = p - p0;
        } else {
            p = p0;
            range = range0;
        }
    }
    return start;
}
/* Best-effort creation of <debug_path>/<prefix>/<YYYYMMDD-HHMMSS>;
   the resulting path is written into debug_dir. mkdir failures are
   ignored, matching the original behaviour. */
void create_debug_dir(char *debug_dir, size_t debug_dir_size,
                      const char *debug_path, const char *prefix)
{
    char parent[1024];
    struct tm *tm;
    time_t now;

    /* <debug_path>/<prefix> */
    snprintf(parent, sizeof(parent), "%s/%s", debug_path, prefix);
#ifdef _WIN32
    _mkdir(parent);
#else
    mkdir(parent, 0777);
#endif
    /* timestamped subdirectory from the local time */
    now = time(NULL);
    tm = localtime(&now);
    snprintf(debug_dir, debug_dir_size, "%s/%04u%02u%02u-%02u%02u%02u",
             parent,
             tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
             tm->tm_hour, tm->tm_min, tm->tm_sec);
#ifdef _WIN32
    _mkdir(debug_dir);
#else
    mkdir(debug_dir, 0777);
#endif
}
/* we print at least 3 significant digits with at most 5 chars, except
   if larger than 9999T. The value is rounded to zero. */
char *get_si_prefix(char *buf, int buf_size, uint64_t val)
{
    static const char suffixes[] = "kMGT";
    uint64_t base;
    int i;

    if (val <= 999) {
        snprintf(buf, buf_size, "%" PRId64, val);
        return buf;
    }
    base = 1000;
    for (i = 0; i < 4; i++) {
        /* Note: we round to 0 */
        if (val < base * 10) {
            snprintf(buf, buf_size, "%0.2f%c",
                     floor((val * 100.0) / base) / 100.0,
                     suffixes[i]);
            return buf;
        }
        if (val < base * 100) {
            snprintf(buf, buf_size, "%0.1f%c",
                     floor((val * 10.0) / base) / 10.0,
                     suffixes[i]);
            return buf;
        }
        if (val < base * 1000 || i == 3) {
            snprintf(buf, buf_size,
                     "%" PRId64 "%c",
                     val / base,
                     suffixes[i]);
            return buf;
        }
        base = base * 1000;
    }
    return buf;
}

48
gpt2/cp_utils.h Normal file
View file

@ -0,0 +1,48 @@
/*
* Compression utilities
*
* Copyright (c) 2018-2019 Fabrice Bellard
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "arith.h"
#include "libnc.h"
/* NOTE(review): this header has no include guard; it apparently relies
   on being included once per translation unit -- confirm. */
/* Print "Fatal error: <msg>" on stderr and exit(1). */
void __attribute__((noreturn, format(printf, 1, 2))) fatal_error(const char *fmt, ...);
/* Millisecond timestamp (CLOCK_MONOTONIC on POSIX). */
int64_t get_time_ms(void);
/* Big-endian primitive serialisation; fget_* return 0 on success, -1 on error. */
void fput_u8(FILE *f, uint8_t v);
int fget_u8(FILE *f, uint8_t *pv);
void fput_be16(FILE *f, uint16_t v);
int fget_be16(FILE *f, uint16_t *pv);
void fput_be32(FILE *f, uint32_t v);
int fget_be32(FILE *f, uint32_t *pv);
void fput_f32(FILE *f, float v);
int fget_f32(FILE *f, float *pv);
/* SGD optimiser parameter (de)serialisation and debug dump. */
void fput_sgd_opt(FILE *f, const SGDOptParams *p);
int fget_sgd_opt(FILE *f, SGDOptParams *p);
void dump_sgd_opt_params(FILE *f, const SGDOptParams *p);
/* Arithmetic-coded symbol I/O driven by a probability table. */
void write_sym(PutBitState *pb, const float *prob_table, int n_symb, int sym);
int read_sym(GetBitState *gb, const float *prob_table, int n_symb);
/* Create <debug_path>/<prefix>/<timestamp> and return it in debug_dir. */
void create_debug_dir(char *debug_dir, size_t debug_dir_size,
                      const char *debug_path, const char *prefix);
/* Format val into buf with an SI suffix (k/M/G/T); returns buf. */
char *get_si_prefix(char *buf, int buf_size, uint64_t val);

152
gpt2/cutils.h Normal file
View file

@ -0,0 +1,152 @@
/* Generic C utility macros and inline helpers: compiler hints,
   min/max/clamp, bit-scan wrappers and a cycle counter. */
#ifndef CUTILS_H
#define CUTILS_H
#include <inttypes.h>
/* inlining control (GCC/Clang attributes) */
#define force_inline inline __attribute__((always_inline))
#define no_inline __attribute__((noinline))
#define __unused __attribute__((unused))
/* two-level glue so that macro arguments are expanded before pasting */
#define xglue(x, y) x ## y
#define glue(x, y) xglue(x, y)
#ifndef offsetof
#define offsetof(type, field) ((size_t) &((type *)0)->field)
#endif
/* number of elements of a static array */
#define countof(x) (sizeof(x) / sizeof(x[0]))
/* branch prediction hints */
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
typedef int BOOL;
#ifndef FALSE
enum {
    FALSE = 0,
    TRUE = 1,
};
#endif
/* raw 16-bit container for bfloat16 values */
typedef struct {
    uint16_t u16;
} bfloat16_t;
/* read the CPU timestamp counter (used for profiling) */
#if defined(__x86_64__)
static inline int64_t get_cycles(void)
{
    uint32_t low,high;
    int64_t val;
    asm volatile("rdtsc" : "=a" (low), "=d" (high));
    val = high;
    val <<= 32;
    val |= low;
    return val;
}
#else
/* NOTE(review): the "=A" constraint is i386-specific, so this fallback
   does not build on non-x86 targets -- confirm intended portability. */
static inline int64_t get_cycles(void)
{
    int64_t val;
    asm volatile ("rdtsc" : "=A" (val));
    return val;
}
#endif
/* min/max helpers for the integer types used in the code base */
static inline int max_int(int a, int b)
{
    if (a > b)
        return a;
    else
        return b;
}
static inline int min_int(int a, int b)
{
    if (a < b)
        return a;
    else
        return b;
}
static inline size_t max_size_t(size_t a, size_t b)
{
    if (a > b)
        return a;
    else
        return b;
}
static inline size_t min_size_t(size_t a, size_t b)
{
    if (a < b)
        return a;
    else
        return b;
}
static inline ssize_t max_ssize_t(ssize_t a, ssize_t b)
{
    if (a > b)
        return a;
    else
        return b;
}
static inline ssize_t min_ssize_t(ssize_t a, ssize_t b)
{
    if (a < b)
        return a;
    else
        return b;
}
/* clamp val into [min_val, max_val] */
static inline int clamp_int(int val, int min_val, int max_val)
{
    if (val < min_val)
        return min_val;
    else if (val > max_val)
        return max_val;
    else
        return val;
}
static inline float clamp_float(float val, float min_val, float max_val)
{
    if (val < min_val)
        return min_val;
    else if (val > max_val)
        return max_val;
    else
        return val;
}
/* count leading zeros. WARNING: undefined if a = 0 */
static inline int clz32(unsigned int a)
{
    return __builtin_clz(a);
}
/* WARNING: undefined if a = 0 */
static inline int clz64(uint64_t a)
{
    return __builtin_clzll(a);
}
/* floor(log2(a)); requires a != 0 (inherits the clz64 restriction) */
static inline int floor_log2(uint64_t a)
{
    return 63 - clz64(a);
}
/* ceil(log2(a)); returns 0 for a <= 1 */
static inline int ceil_log2(uint64_t a)
{
    if (a <= 1)
        return 0;
    else
        return 64 - clz64(a - 1);
}
static inline float squaref(float x)
{
    return x * x;
}
/* expand one initialiser into eight copies */
#define DUP8(a) a, a, a, a, a, a, a, a
#endif /* CUTILS_H */

2023
gpt2/gpt2tc.c Normal file

File diff suppressed because it is too large Load diff

143
gpt2/gpt2tc.h Normal file
View file

@ -0,0 +1,143 @@
#ifndef _GPT2TC_H
#define _GPT2TC_H
#ifdef __cplusplus
extern "C" {
#endif
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>
#include "cutils.h"
#include "arith.h"
#include "cp_utils.h"
#include "list.h"
#include "libnc.h"
/* Interface of the GPT-2 text-completion / text-compression engine
   implemented in gpt2tc.c on top of the libnc tensor library. */
#define MAX_INITIAL_TEXT_LEN 256 /* in symbols */
#define MAX_OUTPUT_LEN 100
#define DEFAULT_TOP_K 40
#define DEFAULT_TOP_P 0.9
#define BATCH_SIZE_MAX 16
//#define BATCH_SIZE_MAX 1
/* one tokenised text symbol (presumably a vocabulary/BPE token id --
   confirm against gpt2tc.c) */
typedef uint16_t DataSymbol;
/* supported GPT-2 checkpoint sizes */
typedef enum {
    GPT2_MODEL_117M,
    GPT2_MODEL_345M,
    GPT2_MODEL_774M,
    GPT2_MODEL_1558M,
} GPT2ModelEnum;
/* transformer hyper-parameters */
typedef struct {
    BOOL is_decoder;
    int n_layer;    /* number of transformer blocks */
    int d_model;    /* model (embedding) width */
    int n_head;     /* attention head count */
    int d_key;
    int d_value;
    int d_inner;    /* feed-forward hidden width */
    int n_ctx;      /* maximum context length */
    int n_symbols;  /* vocabulary size */
    uint32_t seed;
} TransformerModelParams;
/* weight tensors of one transformer block */
typedef struct {
    NCTensor *ln_1_g, *ln_1_b;
    NCTensor *attn_w, *attn_b;
    NCTensor *attn_proj_w, *attn_proj_b;
    NCTensor *ln_2_g, *ln_2_b;
    NCTensor *mlp_fc_w, *mlp_fc_b;
    NCTensor *mlp_proj_w, *mlp_proj_b;
} TransformerLayer;
/* a fully loaded transformer model plus its libnc context/device */
typedef struct {
    RNDState rnd_state;
    NCContext *model;
    NCDevice *device;
    int n_layer;
    int d_model;
    int n_head;
    int d_key;
    int d_value;
    int d_inner;
    int n_symbols;
    int n_ctx;
    /* parameters */
    NCParamList param_list;
    TransformerLayer *layers;
    NCTensor *wte, *wpe, *wte_trans; /* token/position embedding tables */
    NCTensor *ln_f_g, *ln_f_b;       /* final layer-norm gain/bias */
} TransformerModel;
/* one vocabulary entry; entries chain through 'next' */
typedef struct Word {
    uint32_t next; /* -1 = end */
    uint32_t len;  /* length of buf in bytes */
    uint8_t *buf;
} Word;
/* word table with hash lookup (presumably the tokeniser vocabulary --
   confirm in gpt2tc.c) */
typedef struct {
    Word *words;
    size_t word_count;
    size_t word_size;
    uint32_t *hash_table;
    int hash_size;
    int hash_bits;
} WordList;
/* state shared by all generation contexts: model plus word list */
typedef struct {
    TransformerModel *trf_state;
    WordList *wl;
} TextCompleteGlobalState;
/* one in-flight generation request, linked into a batch list */
typedef struct {
    struct list_head link;
    TextCompleteGlobalState *global_state;
    int top_k;          /* top-k sampling cutoff */
    float top_p;        /* top-p (nucleus) sampling cutoff */
    float temperature;
    RNDState rnd_state;
    NCTensor **mem_k, **mem_v; /* per-layer k/v tensors (presumably the
                                  attention cache -- see gpt2tc.c) */
    DataSymbol *input_buf;
    int input_buf_len;
    int text_len; /* current input text len */
    BOOL is_first;
    int last_c;
    int max_output_len;
    /* output */
    char out_text[1024];
    int out_text_len; /* 0 means end of output */
} TextGenContext;
/* model selection / parameter helpers */
GPT2ModelEnum parse_model(const char *str);
void trf_set_params(TransformerModelParams *p, GPT2ModelEnum model);
/* tokenisation (file-level and buffer-level) */
void gpt2_pp_encode(const char *word_filename, const char *in_filename, const char *out_filename);
size_t gpt2_pp_encode_buf(WordList *s, DataSymbol **pout_buf, const uint8_t *buf, size_t buf_size);
void gpt2_pp_decode(const char *word_filename, const char *in_filename, const char *out_filename);
char *trim_text(const char *str);
/* batched text-completion API */
TextCompleteGlobalState *text_complete_global_init(GPT2ModelEnum model, const char *filename);
void text_complete_global_end(TextCompleteGlobalState *tcs);
TextGenContext *text_complete_start(TextCompleteGlobalState *tcs, const char *input_text, int top_k, float top_p, float temperature, int seed, int max_output_len);
void text_complete_next(TextCompleteGlobalState *tcs, struct list_head *ts_list);
void text_complete_end(TextGenContext *ts);
void text_complete(GPT2ModelEnum model, const char *model_filename, const char *input_text, int top_k, float top_p, float temperature, int max_output_len, int batch_size, int seed, BOOL verbose);
/* UTF-8 helpers */
int unicode_to_utf8(uint8_t *buf, unsigned int c);
int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp);
/* binary <-> printable-text conversion used by the compressor */
size_t convert_to_chars(char **pout_buf, uint8_t *buf, size_t n_bits);
ssize_t convert_from_chars(uint8_t **pout_buf, const char *str);
int encode_length(PutBitState *pb, uint32_t val);
int decode_length(GetBitState *gb);
/* GPT-2 based text / file compression */
int text_decompress(TextCompleteGlobalState *tcs, char **poutput_text, const char *input_text);
int text_compress(TextCompleteGlobalState *tcs, char **poutput_text, const char *input_text, BOOL dump_stats);
void text_compress_test(GPT2ModelEnum model, const char *model_filename, const char *input_text, BOOL is_decode, BOOL verbose);
int file_compress(TextCompleteGlobalState *tcs, const char *infilename, const char *outfilename);
int file_decompress(TextCompleteGlobalState *tcs, const char *infilename, const char *outfilename);
#ifdef __cplusplus
}
#endif
#endif

BIN
gpt2/gpt2vocab.txt Normal file

Binary file not shown.

426
gpt2/libnc.h Normal file
View file

@ -0,0 +1,426 @@
/*
* LibNC
*
* Copyright (c) 2018-2019 Fabrice Bellard
*
*/
#ifndef LIBNC_H
#define LIBNC_H
#include <inttypes.h>
#include "cutils.h"
#include "list.h"
/* profiling */
typedef enum {
PROF_EVAL,
PROF_GRAD,
PROF_SGD,
PROF_UPDATE,
PROF_WRITE_SYM,
PROF_PROBE,
PROF_TOTAL,
PROF_COUNT,
} ProfEnum;
/* Profiling helpers: when built with -DPROFILE each prof_* call
   accumulates cycle counts (via get_cycles()) and sample/op counts per
   category; otherwise they compile to empty inline stubs. */
#ifdef PROFILE
extern int64_t prof_cycles[PROF_COUNT];
extern int64_t prof_samples[PROF_COUNT];
extern int64_t prof_ops[PROF_COUNT];
/* start timing category 'idx' */
static inline void prof_start(int idx)
{
    prof_cycles[idx] -= get_cycles();
}
/* stop timing category 'idx' and count one sample */
static inline void prof_end(int idx)
{
    prof_cycles[idx] += get_cycles();
    prof_samples[idx]++;
}
/* stop timing and additionally credit 'n_ops' operations */
static inline void prof_end_ops(int idx, int n_ops)
{
    prof_cycles[idx] += get_cycles();
    prof_ops[idx] += n_ops;
    prof_samples[idx]++;
}
#else
/* no-op stubs when profiling is disabled */
static inline void prof_start(int idx)
{
}
static inline void prof_end(int idx)
{
}
static inline void prof_end_ops(int idx, int n_ops)
{
}
#endif
void nc_prof_dump(void);
/* Automatic Differentiation Engine */
typedef struct NCContext NCContext;
typedef struct NCDevice NCDevice;
typedef struct NCTensor NCTensor;
typedef struct NCTensorBuffer NCTensorBuffer;
typedef struct NCNode NCNode;
typedef struct NCRNDState NCRNDState;
typedef struct NCSGDOptState NCSGDOptState;
typedef enum {
NC_TYPE_F32,
NC_TYPE_BF16,
NC_TYPE_F16,
NC_TYPE_I8,
NC_TYPE_I16,
NC_TYPE_I32,
NC_TYPE_COUNT,
} NCTypeEnum;
extern size_t nc_type_size_table[NC_TYPE_COUNT];
extern const char *nc_type_name_table[NC_TYPE_COUNT];
#define NC_N_DIMS_MAX 4 /* maximum number of axis for tensors */
typedef struct NCTensorData {
NCTypeEnum item_type;
size_t item_size;
void *data;
size_t stride; /* in elements */
size_t n_strides; /* prod(j = 1 ... n_dims, dims[j]); */
int n_dims;
const size_t *dims; /* n_dims length */
const size_t *strides; /* n_dims length, strides in bytes */
} NCTensorData;
void *nc_malloc(size_t size);
void *nc_mallocz(size_t size);
void nc_free(void *ptr);
NCContext *nc_context_init(int nb_threads);
void nc_context_end(NCContext *m);
NCDevice *nc_new_cpu_device(NCContext *m);
NCDevice *nc_new_cuda_device(NCContext *m, int device_index);
NCDevice *nc_new_device(NCContext *m, const char *device_name);
void nc_synchronize(NCDevice *d);
NCTensorBuffer *nc_new_tensor_buffer(NCDevice *d, size_t size);
NCTensorBuffer *nc_dup_tensor_buffer(const NCTensorBuffer *b);
void nc_free_tensor_buffer(NCTensorBuffer *b);
NCTensor *nc_new_tensor(NCDevice *d, NCTypeEnum type,
int n_dims, const size_t *dims);
NCTensor *nc_new_tensor_from_tensor(const NCTensor *x);
NCTensor *nc_new_tensor_from_tensor_nz(const NCTensor *x);
NCTensor *nc_new_scalar(NCDevice *d, NCTypeEnum type);
NCTensor *nc_new_tensor_1d(NCDevice *d, NCTypeEnum type, size_t len);
NCTensor *nc_new_tensor_2d(NCDevice *d, NCTypeEnum type, size_t n0, size_t n1);
NCTensor *nc_new_tensor_3d(NCDevice *d, NCTypeEnum type,
size_t n0, size_t n1, size_t n2);
NCTensor *nc_new_tensor_4d(NCDevice *d, NCTypeEnum type,
size_t n0, size_t n1, size_t n2, size_t n3);
NCTensor *__attribute__((format(printf, 2, 3))) nc_tensor_set_name(NCTensor *x, const char *fmt, ...);
NCTensor *nc_dup_tensor(const NCTensor *x);
void nc_free_tensor(NCTensor *x);
void nc_dump_tensor(const char *name, NCTensor *x, size_t n);
uint32_t nc_tensor_get_hash(NCTensor *x);
void nc_dump_tensor_hash(const char *name, const NCTensor *x);
NCNode *nc_get_node(NCTensor *x);
/* create an alias to tensor 'x1'. Gradient is not propagated thru it */
NCTensor *nc_slice_alias(const NCTensor *x1, int axis, size_t start, size_t end);
NCTypeEnum nc_tensor_get_item_type(const NCTensor *x);
NCTensorData *nc_tensor_get_data(NCTensorData *sd, const NCTensor *x);
/* Return a pointer to the tensor data. If *pstride is non NULL,
return the stride (in elements) of the first dimension. */
void *nc_tensor_get_ptr(NCTensor *x, size_t *pstride);
const size_t *nc_tensor_get_dims(const NCTensor *x, int *pn_dims);
void nc_tensor_set_zero(NCTensor *y);
void nc_tensor_set_f32(NCTensor *y, float val);
NCRNDState *nc_rnd_init(NCDevice *d, uint32_t seed);
void nc_rnd_end(NCRNDState *s);
void nc_tensor_set_rnd_unif(NCTensor *y, float avg, float range,
NCRNDState *rnd_state);
void nc_tensor_set_dropout(NCTensor *y, float prob, NCRNDState *rnd_state);
void nc_set1_i32(NCTensor *y, int n_dims, const size_t *tab_indexes,
int32_t val);
void nc_set1_i32_1d(NCTensor *y, size_t i0, int32_t val);
void nc_set1_i32_2d(NCTensor *y, size_t i0, size_t i1, int32_t val);
void nc_set1_f32(NCTensor *y, int n_dims, const size_t *tab_indexes,
float val);
void nc_set1_f32_1d(NCTensor *y, size_t i0, float val);
int32_t nc_get1_i32(const NCTensor *x, int n_dims, const size_t *tab_indexes);
float nc_get1_f32(const NCTensor *x, int n_dims, const size_t *tab_indexes);
float nc_get1_f32_1d(const NCTensor *x, size_t i0);
float nc_get_scalar_f32(const NCTensor *x);
void nc_tensor_copy(NCTensor *dst, NCTensor *src);
void nc_tensor_convert(NCTensor *dst, NCTensor *src);
void nc_dump_dims(const char *str, NCTensor *x);
size_t nc_get_heap_size(NCContext *m);
NCContext *nc_get_tensor_context(const NCTensor *x);
NCTensor *nc_tensor_to_device(NCTensor *x, NCDevice *d);
NCTensor *nc_tensor_to_cpu_device(NCTensor *x);
NCDevice *nc_get_tensor_device(const NCTensor *x);
/* element wise operations */
NCTensor *nc_convert(NCTensor *x, NCTypeEnum new_type);
NCTensor *nc_add(NCTensor *x1, NCTensor *x2);
NCTensor *nc_neg(NCTensor *x);
NCTensor *nc_sub(NCTensor *x1, NCTensor *x2);
NCTensor *nc_mul(NCTensor *x1, NCTensor *x2);
NCTensor *nc_div(NCTensor *x1, NCTensor *x2);
NCTensor *nc_recip(NCTensor *x);
NCTensor *nc_min(NCTensor *x1, NCTensor *x2);
NCTensor *nc_max(NCTensor *x1, NCTensor *x2);
/* select x1[i] if z[i] = 0 and x2[i] otherwise */
NCTensor *nc_select(NCTensor *z, NCTensor *x1, NCTensor *x2);
/* set y[i] = x1[i] if mask[i] = 0 and y[i] = c if mask[i] != 0. If
mask_inv is TRUE, 'mask' is inverted */
NCTensor *nc_masked_fill(NCTensor *x, NCTensor *mask, float c, BOOL mask_inv);
NCTensor *nc_sigmoid(NCTensor *x);
NCTensor *nc_tanh(NCTensor *x);
NCTensor *nc_relu(NCTensor *x);
NCTensor *nc_gelu(NCTensor *x);
NCTensor *nc_log(NCTensor *x);
/* return cp * fg + min(1 - fg, ig) * in */
NCTensor *nc_lstm_clamped(NCTensor *cp, NCTensor *in,
NCTensor *fg, NCTensor *ig);
/* return a * (1 - t) + b * t */
NCTensor *nc_lerp(NCTensor *a, NCTensor *b, NCTensor *t);
/* other operations */
NCTensor *nc_new_vec_f32(NCDevice *d, size_t n, float val);
NCTensor *nc_new_f32(NCDevice *d, float val);
NCTensor *nc_reshape(NCTensor *x, int n_dims, const size_t *dims);
NCTensor *nc_reshape_1d(NCTensor *x, size_t n0);
NCTensor *nc_reshape_2d(NCTensor *x, size_t n0, size_t n1);
NCTensor *nc_reshape_3d(NCTensor *x, size_t n0, size_t n1, size_t n2);
NCTensor *nc_reshape_4d(NCTensor *x, size_t n0, size_t n1, size_t n2,
size_t n3);
/* duplicate the tensor by adding n_dims dimensions */
NCTensor *nc_repeat(NCTensor *x, int n_dims, const size_t *dims);
NCTensor *nc_repeat_1d(NCTensor *x, size_t n);
/* return y0 + sum over the dimensions > n_dims of 'x'. y0 = NULL
is supported */
NCTensor *nc_reduce_sum(NCTensor *y0, NCTensor *x, int n_dims);
/* sum all the elements of a tensor */
NCTensor *nc_sum(NCTensor *x);
/* sum of squares */
NCTensor *nc_reduce_sum_sqr(NCTensor *x);
NCTensor *nc_slice(NCTensor *x, int axis, size_t start, size_t end);
NCTensor *nc_slice_add(NCTensor *y0, NCTensor *x, int axis, size_t start);
/* concatenation along axis 'axis' */
NCTensor *nc_concat(NCTensor **inputs, int n_inputs, int axis);
/* shortcut for axis = 0 */
NCTensor *nc_vconcat(NCTensor **inputs, int n_inputs);
/* shortcut for axis = 1 */
NCTensor *nc_hconcat(NCTensor **inputs, int n_inputs);
/* split along axis 'axis'. If tab_size = NULL, split equally. */
void nc_split(NCTensor **tab_y, NCTensor *x, int n_outputs,
const size_t *tab_size, int axis);
/* shortcut for axis = 0 */
void nc_vsplit(NCTensor **tab_y, NCTensor *x, int n_outputs,
const size_t *tab_size);
/* shortcut for axis = 1 */
void nc_hsplit(NCTensor **tab_y, NCTensor *x, int n_outputs,
const size_t *tab_size);
typedef enum {
NC_PAD_ZERO,
NC_PAD_DUP, /* duplicate element */
/* trim types, dual to padding */
NC_TRIM_NORMAL = NC_PAD_ZERO,
NC_TRIM_SUM, /* add trimmed elements to the edge */
} NCPadEnum;
/* pad (len > 0) or trim (len < 0) the axis 0 of 'x' */
NCTensor *nc_pad(NCTensor *x, ssize_t left_len, NCPadEnum left_op,
ssize_t right_len, NCPadEnum right_op);
/* shortcut to nc_pad() */
NCTensor *nc_resize(NCTensor *x, size_t n);
/* if x is not contiguous then create a new contiguous tensor and copy
x to it. Otherwise, return 'x'. */
NCTensor *nc_make_contiguous(NCTensor *x);
/* Return a new tensor sharing the same buffer as 'x' with the permuted
dimensions. axis[i] is the corresponding axis in 'x' */
NCTensor *nc_permute_alias(NCTensor *x, int n_dims, const int *axis);
/* same as nc_permute_alias but calls nc_make_contiguous after. */
NCTensor *nc_permute(NCTensor *x, int n_dims, const int *axis);
/* special case of nc_permute() */
NCTensor *nc_transpose(NCTensor *x);
NCTensor *nc_matmul(NCTensor *w, NCTensor *x);
/* return w*x + y0. w and x can be optionally transposed. y0 can be NULL */
NCTensor *nc_matmul_add(NCTensor *w, NCTensor *x, NCTensor *y0,
BOOL w_trans, BOOL x_trans);
NCTensor *nc_matmul_stride(NCTensor *w, NCTensor *x);
/* return a matrix where each column is the column x[i] of matrix 'w' */
NCTensor *nc_get_col(NCTensor *w, NCTensor *x);
/* add the vectors 'z' at column number 'x' in matrix 'w'. */
NCTensor *nc_add_col(NCTensor *z, NCTensor *x, NCTensor *w);
/* select the x-th element in each column of 'w' */
NCTensor *nc_get_element(NCTensor *w, NCTensor *x);
/* add z to the x-th element in each column of 'w' */
NCTensor *nc_add_element(NCTensor *z, NCTensor *x, NCTensor *w);
NCTensor *nc_soft_max(NCTensor *x);
/* Equivalent to y = log(get_element(x, eout)). It is expected to be
used as nc_indexed_log(nc_soft_max(x), eout) so that the gradient
computation is optimized. */
NCTensor *nc_indexed_log(NCTensor *x, NCTensor *eout);
NCTensor *nc_layer_norm(NCTensor *x, float eps);
NCTensor *nc_rms_norm(NCTensor *x, float eps);
NCTensor *nc_slt_mat_set(NCTensor *x, size_t pos, float c);
/* shift the column 'i' by 'pos + i * mult' elements and pad with zeros */
NCTensor *nc_rel_shift(NCTensor *x, ssize_t pos, ssize_t mult);
/* auto differentiation */
/* get_col_index is non NULL in the sparse gradient case */
typedef void NCParamUpdateFunc(void *opaque, NCTensor *grad,
NCTensor *get_col_index);
/* add a 'parameter' graph node to 'x' and return 'x'. */
NCTensor *nc_set_param(NCTensor *x, void *opaque);
/* return a new tensor with its graph removed */
NCTensor *nc_stop_grad(NCTensor *x);
/* manipulation of graph nodes */
NCNode *nc_dup_node(const NCNode *n);
void nc_free_node(NCNode *n);
void nc_combine_nodes(NCContext *m, NCNode **tab_op1, int count,
int axis, int elem_size, const size_t *tab_elem_size);
NCNode *nc_concat_node(NCContext *m, NCNode **inputs, int count,
int axis, const size_t *tab_size);
void nc_concat_optimization(NCContext *m, NCNode **concat_nodes, int count);
void nc_node_set_parent(NCNode *n, int arg_index, const NCNode *n1);
void nc_node_set_arg(NCNode *n, int arg_index, const NCTensor *x);
#define NC_BW_KEEP_GRAD_GRAPH (1 << 0)
/* optimize the nc_get_col() gradient */
#define NC_BW_SPARSE_GRAD (1 << 1)
void nc_backward(const NCTensor *x, NCTensor *grad,
NCParamUpdateFunc *param_update_func, int flags);
void nc_dump_graph(NCTensor *x);
/* utilities for function parameters */
typedef struct {
struct list_head link;
NCTensor **pval; /* pointer to the tensor location */
char *name; /* parameter name */
NCTensor *low_part; /* if BF16 parameter, additional 16 bit precision */
NCTensor *saved_grad; /* debug */
/* SGD opt data */
struct SGDOptVarState *sgd_opt;
} NCParam;
typedef struct {
struct list_head param_list;
BOOL add_graph;
} NCParamList;
void nc_param_list_init(NCParamList *pl);
void nc_param_list_set_graph(NCParamList *pl, BOOL add_graph);
NCParam *nc_new_param_str(NCParamList *pl, NCTensor **pval, const char *str);
__attribute__((format(printf, 3, 4))) NCParam *nc_new_param(NCParamList *pl, NCTensor **pval, const char *fmt, ...);
void nc_param_list_end(NCParamList *pl);
NCParam *nc_find_param(NCParamList *pl, const char *name);
size_t nc_get_param_count(NCParamList *pl);
void nc_save_coefs(NCParamList *pl, const char *filename);
void nc_load_coefs(NCParamList *pl, const char *filename);
void nc_save_state(NCParamList *pl, const char *filename);
void nc_load_state(NCParamList *pl, const char *filename);
/* SGD optimizer */
typedef enum {
SGD_OPT_BASIC,
SGD_OPT_ADAM,
SGD_OPT_TEST,
} SGDOptAlgoEnum;
typedef struct {
SGDOptAlgoEnum algo;
union {
struct {
float beta1;
float beta2;
float eps;
float gradient_clip; /* if != 0, per parameter gradient clipping */
} adam;
} u;
float lr;
} SGDOptParams;
NCSGDOptState *nc_sgd_opt_init(NCContext *m, const SGDOptParams *p);
void nc_sgd_opt_end(NCSGDOptState *s);
void sgd_opt_update_var(void *opaque, NCTensor *yg, NCTensor *get_col_index);
/* set the SGD optimizer 's' to all parameters of the model */
void nc_sgd_opt_set_all(NCParamList *param_list, NCSGDOptState *s);
/* set the SGD optimizer 's' to the variable 'x'. Remove it if s = NULL */
void nc_sgd_opt_set(NCParam *x, NCSGDOptState *s);
void nc_sgd_opt_update(NCSGDOptState *s);
/* force the learning rate */
void nc_sgd_opt_set_lr(NCSGDOptState *s, float lr);
float nc_sgd_opt_get_lr(NCSGDOptState *s);
/* for SGD_OPT_TEST */
NCTensor *nc_sgd_opt_get_grad(NCParam *p);
/* misc utilities (to be removed) */
typedef struct {
uint32_t seed;
/* used by Gaussian generator */
int idx;
float y1;
} RNDState;
typedef struct {
uint16_t u16;
} nc_float16_t;
void rnd_init(RNDState *s, uint32_t seed);
uint32_t rnd_unif_u32(RNDState *s);
float rnd_unif(RNDState *s);
void rnd_unif_vec(float *tab, size_t n, float mu, float range,
RNDState *s);
void rnd_unif_mat(float *tab, size_t stride, size_t h, size_t w,
float mu, float sigma, RNDState *s);
float vec_sum_f32(const float *tab, size_t n);
typedef struct {
float val;
uint32_t idx;
} NCTopKEntry;
/* Return the k largest values among prob[0...n_symb-1] such that k is
the largest value such that k <= topk and sum(i=0 .. k - 2,
prob[tab[i]]) < topp.
It is assumed that prob[i] >= 0. The function returns (k, tab,
sum). 'sum' is the sum of the k returned values. 'tab' must be
freed with nc_free(). */
int nc_topk(NCTopKEntry **ptab, double *psum,
const float *prob, size_t n, int topk, float topp);
#endif /* LIBNC_H */

96
gpt2/list.h Normal file
View file

@ -0,0 +1,96 @@
/*
* Linux klist like system
*
* Copyright (c) 2016-2017 Fabrice Bellard
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef LIST_H
#define LIST_H
/* Doubly-linked circular intrusive list, modeled after the Linux kernel's
   <linux/list.h>. The head is a sentinel node: an empty list is a head whose
   prev/next both point back at itself. Elements embed a struct list_head and
   recover the enclosing object with list_entry(). */
struct list_head {
    struct list_head *prev;
    struct list_head *next;
};

#define LIST_HEAD_INIT(el) { &(el), &(el) }

/* return the pointer of type 'type *' containing 'el' as field 'member' */
#define list_entry(el, type, member) \
    ((type *)((uint8_t *)(el) - offsetof(type, member)))

/* Make 'head' an empty list (both links point back at the sentinel). */
static inline void init_list_head(struct list_head *head)
{
    head->next = head->prev = head;
}

/* Link 'el' in between the two adjacent nodes 'prev' and 'next'. */
static inline void __list_add(struct list_head *el,
                              struct list_head *prev, struct list_head *next)
{
    el->prev = prev;
    el->next = next;
    prev->next = el;
    next->prev = el;
}

/* Insert 'el' right after the sentinel, i.e. at the front of the list. */
static inline void list_add(struct list_head *el, struct list_head *head)
{
    __list_add(el, head, head->next);
}

/* Insert 'el' right before the sentinel, i.e. at the back of the list. */
static inline void list_add_tail(struct list_head *el, struct list_head *head)
{
    __list_add(el, head->prev, head);
}

/* Unlink 'el' from whatever list it is on; its own links are cleared so a
   stale use is more likely to fault immediately. */
static inline void list_del(struct list_head *el)
{
    el->prev->next = el->next;
    el->next->prev = el->prev;
    el->prev = NULL; /* fail safe */
    el->next = NULL; /* fail safe */
}

/* A list is empty when the sentinel points back at itself. */
static inline int list_empty(struct list_head *el)
{
    return el->next == el;
}

#define list_for_each(el, head) \
    for(el = (head)->next; el != (head); el = el->next)

#define list_for_each_safe(el, el1, head) \
    for(el = (head)->next, el1 = el->next; el != (head); \
        el = el1, el1 = el->next)

#define list_for_each_prev(el, head) \
    for(el = (head)->prev; el != (head); el = el->prev)

#define list_for_each_prev_safe(el, el1, head) \
    for(el = (head)->prev, el1 = el->prev; el != (head); \
        el = el1, el1 = el->prev)
#endif /* LIST_H */

86
gpt2/readme.txt Normal file
View file

@ -0,0 +1,86 @@
GPT-2 text completion and compression demo
==========================================
1) Usage
--------
Extract the 117M GPT-2 model to the gpt2tc directory:
tar xzf gpt2tc-117M.tar.gz
Text completion example:
./gpt2tc g "Hello, my name is"
Use more CPU cores (only faster on server CPUs):
./gpt2tc -T 4 g "Hello, my name is"
Short Text compression and decompression example:
./gpt2tc cs "Hello, how are you ?"
./gpt2tc ds "姯敳痪"
Text compression example:
./gpt2tc c in.txt out.bin
Decompression:
./gpt2tc d out.bin out.txt
2) Using larger models
----------------------
The smallest GPT-2 model (117M) is provided in a separate
archive. Larger models can be built by downloading the TensorFlow
parameters and converting them with the attached script. Example:
# download the model to models/345M
./download_model.sh 345M
# convert it to the gpt2tc format:
python3 gpt2convert.py models/345M gpt2_345M.bin
# use it
./gpt2tc -m 345M g "Hello, how are you ?"
3) Compression results
----------------------
File Model Original size Compr. size Ratio CMIX v18
#params (bytes) (bytes) (bpb) ratio (bpb)
book1 117M 768771 152283 1.58 1.82
book1 345M 768771 142183 1.48
book1 774M 768771 137562 1.43
book1 1558M 768771 134217 1.40
alice29.txt 117M 152089 23615 1.24 1.65
alice29.txt 345M 152089 20587 1.08
alice29.txt 774M 152089 19096 1.00
alice29.txt 1558M 152089 17382 0.91
enwik5 117M 100000 14875 1.19 1.60
enwik5 345M 100000 13511 1.08
enwik5 774M 100000 13240 1.06
enwik5 1558M 100000 12918 1.03
Notes:
- book1 comes from the Calgary corpus.
- alice29.txt comes from the Canterbury corpus.
- enwik5 contains the first 100000 bytes of the English
Wikipedia dump of March 3, 2006
(http://mattmahoney.net/dc/textdata.html).
- For best performance, use the UTF-8 encoding and don't mix CRLF and
LF line breaks.
- For reference, the results of CMIX
(http://www.byronknoll.com/cmix.html) are provided.
4) More information
-------------------
This demo has no external dependency. It is written in C and uses the
LibNC library for tensor manipulation. The CPU must support AVX2.
A similar program is used for http://textsynth.org/

54
justlm.hpp Normal file
View file

@ -0,0 +1,54 @@
#ifndef LLM_H
#define LLM_H
#include <cstdint>
#include <ctime>
#include <functional>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <string_view>
#include <thread>
#include <vector>
/// Minimal language-model wrapper; the backend (llama.cpp or libnc GPT-2) is
/// selected at build time and hides its data behind the opaque State pointer.
class LLM {
    // Sampling / model hyperparameters shared by all backends.
    struct {
        int32_t seed; // RNG seed
        // Default to half the hardware threads
        int32_t n_threads = static_cast<int32_t>(std::thread::hardware_concurrency()) / 2;
        union {
            int32_t n_ctx; // Context size, llama.cpp specific
            int32_t n_prompt = -1; // Prompt size, gpt2 specific
        };
        int32_t n_batch = 8; // Batch size, unused
        int32_t top_k = 40;
        float top_p = 0.5f;
        float temp = 0.72f;
    } params;
    // Backend-specific state, owned by this object (allocated in init(),
    // released in the destructor).
    struct State *state = nullptr;

    void init(const std::string& weights_path);
    static
    bool ends_with(std::string_view str, std::string_view suffix);

public:
    struct Exception : public std::runtime_error {
        using std::runtime_error::runtime_error;
    };
    struct ContextLengthException : public Exception {
        ContextLengthException() : Exception("Max. context length exceeded") {}
    };

    /// @param weights_path path to the model weights file
    /// @param seed RNG seed; 0 means "seed from the current time"
    /// @throws Exception on backend initialization failure
    LLM(const std::string& weights_path, int32_t seed = 0) {
        // Set random seed
        params.seed = seed?seed:time(NULL);
        // Initialize llm
        init(weights_path);
    }
    ~LLM();
    // Non-copyable: 'state' is an owning raw pointer; the implicitly-defaulted
    // copy operations would lead to a double delete in the destructor.
    LLM(const LLM&) = delete;
    LLM& operator=(const LLM&) = delete;

    /// Append text to the prompt; on_tick reports evaluation progress (0-100)
    /// and may return false to stop early.
    void append(std::string_view prompt, const std::function<bool (float progress)>& on_tick = nullptr);
    /// Generate text until 'end' is produced; on_tick receives each generated
    /// piece and may return false to abort. Returns the text without 'end'.
    std::string run(std::string_view end, const std::function<bool (const char *generated)>& on_tick = nullptr);
};
#endif // LLM_H

9
libjustlm_core.cpp Normal file
View file

@ -0,0 +1,9 @@
#include "justlm.hpp"
#include <string_view>
// Report whether 'str' ends with 'suffix' (an empty suffix always matches).
bool LLM::ends_with(std::string_view str, std::string_view suffix) {
    if (suffix.size() > str.size()) return false;
    return str.substr(str.size() - suffix.size()) == suffix;
}

80
libjustlm_gpt2.cpp Normal file
View file

@ -0,0 +1,80 @@
#include "justlm.hpp"
#include "gpt2/gpt2tc.h"
#include <filesystem>
#include <cstring>
// Backend-specific state for the libnc GPT-2 implementation.
// (The stray file-scope 'state' object that used to follow the definition was
// removed: it was shadowed by LLM's member pointer in every member function,
// never used, and a potential ODR clash with the llama backend.)
struct State {
    std::string prompt;     // Full accumulated prompt (tokenized lazily in run())
    std::string model_path; // Path to the weights file
    GPT2ModelEnum model;    // Model variant, deduced from the weights file size
};
void LLM::init(const std::string& weights_path) {
state->model_path = weights_path;
// Get weight file size
auto weights_size = std::filesystem::file_size(weights_path);
// Determine weight size
switch (weights_size) {
case 250700242: state->model = GPT2_MODEL_117M; break;
case 3120522738: state->model = GPT2_MODEL_1558M; break;
case 712396722: state->model = GPT2_MODEL_345M; break;
case 1551900050: state->model = GPT2_MODEL_774M; break;
default: throw Exception("Unknown model size");
}
}
// Release the backend state allocated in init().
LLM::~LLM() {
    delete state;
}
// Append text to the prompt. The GPT-2 backend tokenizes lazily in run(), so
// this is pure string concatenation and 'on_tick' is never invoked here.
void LLM::append(std::string_view prompt, const std::function<bool (float)> &on_tick) {
    state->prompt.append(prompt);
    // NOTE(review): echoes the appended text to stdout — presumably leftover
    // debug output; confirm before shipping as a library
    std::cout << prompt << std::endl;
}
/// Generate text until the marker 'end' is produced, the model stops, or the
/// callback aborts. Returns the generated text without the end marker.
std::string LLM::run(std::string_view end, const std::function<bool (const char *)> &on_tick) {
    std::string fres;

    // Initialize completion (unused 'count'/'timeval' locals removed)
    TextCompleteGlobalState *tcs = text_complete_global_init(state->model, state->model_path.c_str());
    // Start completion
    // NOTE(review): ?: binds looser than '-', so the prompt size is only
    // subtracted from the 0xfffffff fallback, never from n_prompt — confirm
    // whether that is the intended length limit
    TextGenContext *ts = text_complete_start(tcs, state->prompt.c_str(), params.top_k, params.top_p, params.temp,
                             params.seed, params.n_prompt>0?params.n_prompt:0xfffffff - state->prompt.size());
    struct list_head ts_list;

    // Run completion token by token
    bool abort = false;
    while (!abort && !ends_with(fres, end)) {
        init_list_head(&ts_list);
        list_add_tail(&ts->link, &ts_list);
        text_complete_next(tcs, &ts_list);
        if (ts->out_text_len == 0)
            break; // Model produced no further output
        auto str = std::string_view{ts->out_text, static_cast<std::string_view::size_type>(ts->out_text_len)};
        // Append result to fres
        fres.append(str);
        // Tick
        if (on_tick && !on_tick(std::string(str).c_str()) /*Huge overhead in favor of llama.cpp*/) abort = true;
    }

    // End completion
    text_complete_end(ts);
    text_complete_global_end(tcs);

    // Remember the generated text as part of the prompt
    state->prompt.append(fres);
    // Strip the end marker — but only when it is actually there. The loop can
    // exit via abort or empty output, in which case fres need not end with
    // 'end' and may even be shorter, making the old unconditional
    // 'fres.size()-end.size()' underflow.
    if (ends_with(fres, end))
        fres.resize(fres.size() - end.size());
    // Return final string
    return fres;
}

115
libjustlm_llama.cpp Normal file
View file

@ -0,0 +1,115 @@
#include "justlm.hpp"
#include <ggml.h>
#include <llama.h>
// Backend-specific state for the llama.cpp implementation.
// (The stray file-scope 'state' object that used to follow the definition was
// removed: it was shadowed by LLM's member pointer in every member function
// and never used.)
struct State {
    llama_context *ctx = nullptr; // llama.cpp context handle, freed in ~LLM()
    std::string prompt;           // Full accumulated prompt text
    std::vector<int> embd;        // Token ids evaluated so far
    int n_ctx;                    // Model context window size (set in init())
    // NOTE(review): never written or read in this file — confirm it is needed
    std::string last_result;
};
// Allocate the backend state and create a llama.cpp context from the weights
// file. Throws Exception if the model cannot be loaded.
void LLM::init(const std::string& weights_path) {
    // Allocate state
    state = new State;
    // Get llama parameters
    auto lparams = llama_context_default_params();
    lparams.seed = params.seed;
    // Fall back to a fixed context size when the caller did not request one
    // NOTE(review): '2024' looks like a typo for the conventional 2048 — confirm
    lparams.n_ctx = params.n_ctx>0?params.n_ctx:2024;
    // Create context
    state->ctx = llama_init_from_file(weights_path.c_str(), lparams);
    if (!state->ctx) {
        throw Exception("Failed to initialize llama from file");
    }
    // Cache the actual context size reported by the loaded model
    state->n_ctx = llama_n_ctx(state->ctx);
}
// Free the llama context (if init() got far enough to create it), then the
// backend state itself.
LLM::~LLM() {
    if (state->ctx) llama_free(state->ctx);
    delete state;
}
/// Append text to the prompt, tokenize only the new chunk, and evaluate the
/// new tokens one at a time. 'on_tick' receives progress in percent and may
/// return false to stop evaluating early.
/// @throws ContextLengthException when the context window is nearly full
void LLM::append(std::string_view prompt, const std::function<bool (float)> &on_tick) {
    // An empty prompt so far means the tokenizer must insert the BOS token
    const bool was_empty = state->prompt.empty();
    // Append to current prompt
    state->prompt.append(prompt);
    // Resize buffer for tokens: only the new chunk gets tokenized, and a token
    // never encodes less than one byte, so chunk size + 1 slots suffice (the
    // original reserved space for the entire accumulated prompt).
    const auto old_token_count = state->embd.size();
    state->embd.resize(old_token_count+prompt.size()+1);
    // llama_tokenize() expects a NUL-terminated C string, which a string_view
    // does not guarantee — tokenize the NUL-terminated tail of the stored
    // prompt (it is exactly the chunk we just appended).
    const char *const new_text = state->prompt.c_str() + (state->prompt.size() - prompt.size());
    const auto token_count = llama_tokenize(state->ctx, new_text, state->embd.data()+old_token_count, state->embd.size()-old_token_count, was_empty);
    state->embd.resize(old_token_count+token_count);
    // Make sure limit is far from being hit (keep a 6-token safety margin)
    if (state->embd.size() > size_t(state->n_ctx)-6) {
        // Yup. *this MUST be decomposed now.
        throw ContextLengthException();
    }
    // Evaluate new tokens
    // TODO: Larger batch size
    std::cout << "Context size: " << old_token_count << '+' << token_count << '=' << state->embd.size() << '/' << state->n_ctx << std::endl;
    for (size_t it = old_token_count; it != state->embd.size(); it++) {
        std::cout << llama_token_to_str(state->ctx, state->embd.data()[it]) << std::flush;
        llama_eval(state->ctx, state->embd.data()+it, 1, it, params.n_threads);
        // Tick
        if (on_tick) {
            // Calculate progress in percent
            auto progress = float(it-old_token_count) / (state->embd.size()-old_token_count) * 100.f;
            // Run callback
            if (!on_tick(progress)) break;
        }
    }
    std::cout << std::endl;
}
/// Generate text until the marker 'end' is produced or the callback aborts.
/// Returns the generated text without the end marker.
std::string LLM::run(std::string_view end, const std::function<bool (const char *)> &on_tick) {
    std::string fres;

    // Loop until done
    bool abort = false;
    while (!abort && !ends_with(fres, end)) {
        // Sample top p and top k
        const auto id = llama_sample_top_p_top_k(state->ctx, nullptr, 0, params.top_k, params.top_p, params.temp, 1.0f);
        // Add token
        state->embd.push_back(id);
        // Get token as string
        const auto str = llama_token_to_str(state->ctx, id);
        // Debug
        std::cout << str << std::flush;
        // Append string to function result
        fres.append(str);
        // Evaluate token
        // TODO: Respect batch size
        llama_eval(state->ctx, state->embd.data()+state->embd.size()-1, 1, state->embd.size()-1, params.n_threads);
        // Tick
        if (on_tick && !on_tick(str)) abort = true;
    }

    // Remember the generated text as part of the prompt
    state->prompt.append(fres);
    // Strip the end marker — but only when it is actually there. An aborted
    // run need not end with 'end' and fres may even be shorter than it,
    // making the old unconditional 'fres.size()-end.size()' underflow.
    if (ends_with(fres, end))
        fres.resize(fres.size() - end.size());
    // Return final string
    return fres;
}

1
llama.cpp Submodule

@ -0,0 +1 @@
Subproject commit 9cbc404ba6699a9ba4925ea25a60552b13491c7a

12
test.cpp Normal file
View file

@ -0,0 +1,12 @@
#include "ai.hpp"
#include <iostream>
// Smoke test: complete a fixed prompt and print the result.
// NOTE(review): this targets an 'Ai' wrapper from "ai.hpp", which is not part
// of this repository view — confirm that header exists or port the test to
// the LLM class from justlm.hpp.
int main() {
    Ai ai;
    std::cout << "Completing \"she replied that\"..." << std::endl;
    std::cout << "Using model " << ai.model_name << "..." << std::endl;
    std::cout << "> she replied that" << ai.complete("she replied that", '\n') << std::endl;
}