/*
* LibNC
*
* Copyright (c) 2018-2019 Fabrice Bellard
*
*/
#ifndef LIBNC_H
#define LIBNC_H
#include <inttypes.h>
#include "cutils.h"
#include "list.h"
/* profiling */
typedef enum {
    PROF_EVAL,
    PROF_GRAD,
    PROF_SGD,
    PROF_UPDATE,
    PROF_WRITE_SYM,
    PROF_PROBE,
    PROF_TOTAL,
    PROF_COUNT,
} ProfEnum;
#ifdef PROFILE
extern int64_t prof_cycles[PROF_COUNT];
extern int64_t prof_samples[PROF_COUNT];
extern int64_t prof_ops[PROF_COUNT];
static inline void prof_start(int idx)
{
    prof_cycles[idx] -= get_cycles();
}
static inline void prof_end(int idx)
{
    prof_cycles[idx] += get_cycles();
    prof_samples[idx]++;
}
static inline void prof_end_ops(int idx, int n_ops)
{
    prof_cycles[idx] += get_cycles();
    prof_ops[idx] += n_ops;
    prof_samples[idx]++;
}
#else
static inline void prof_start(int idx)
{
}
static inline void prof_end(int idx)
{
}
static inline void prof_end_ops(int idx, int n_ops)
{
}
#endif
void nc_prof_dump(void);
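/* Usage sketch: the prof_* calls only record data when compiled with
   -DPROFILE; otherwise they compile to empty inline functions. n_ops
   is a caller-supplied operation count:

       prof_start(PROF_EVAL);
       // ... evaluate the network ...
       prof_end_ops(PROF_EVAL, n_ops);
       nc_prof_dump();  // dump the accumulated counters
*/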
/* Automatic Differentiation Engine */
typedef struct NCContext NCContext;
typedef struct NCDevice NCDevice;
typedef struct NCTensor NCTensor;
typedef struct NCTensorBuffer NCTensorBuffer;
typedef struct NCNode NCNode;
typedef struct NCRNDState NCRNDState;
typedef struct NCSGDOptState NCSGDOptState;
typedef enum {
    NC_TYPE_F32,
    NC_TYPE_BF16,
    NC_TYPE_F16,
    NC_TYPE_I8,
    NC_TYPE_I16,
    NC_TYPE_I32,
    NC_TYPE_COUNT,
} NCTypeEnum;
extern size_t nc_type_size_table[NC_TYPE_COUNT];
extern const char *nc_type_name_table[NC_TYPE_COUNT];
#define NC_N_DIMS_MAX 4 /* maximum number of axes for a tensor */
typedef struct NCTensorData {
    NCTypeEnum item_type;
    size_t item_size;
    void *data;
    size_t stride; /* in elements */
    size_t n_strides; /* prod(j = 1 ... n_dims, dims[j]); */
    int n_dims;
    const size_t *dims; /* n_dims length */
    const size_t *strides; /* n_dims length, strides in bytes */
} NCTensorData;
void *nc_malloc(size_t size);
void *nc_mallocz(size_t size);
void nc_free(void *ptr);
NCContext *nc_context_init(int nb_threads);
void nc_context_end(NCContext *m);
NCDevice *nc_new_cpu_device(NCContext *m);
NCDevice *nc_new_cuda_device(NCContext *m, int device_index);
NCDevice *nc_new_device(NCContext *m, const char *device_name);
void nc_synchronize(NCDevice *d);
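/* Typical lifecycle (minimal sketch; the thread count and CUDA device
   index are illustrative values):

       NCContext *m = nc_context_init(1);   // 1 compute thread
       NCDevice *d = nc_new_cpu_device(m);  // or nc_new_cuda_device(m, 0)
       // ... create tensors on 'd' and compute ...
       nc_synchronize(d);                   // wait for pending operations
       nc_context_end(m);
*/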
NCTensorBuffer *nc_new_tensor_buffer(NCDevice *d, size_t size);
NCTensorBuffer *nc_dup_tensor_buffer(const NCTensorBuffer *b);
void nc_free_tensor_buffer(NCTensorBuffer *b);
NCTensor *nc_new_tensor(NCDevice *d, NCTypeEnum type,
                        int n_dims, const size_t *dims);
NCTensor *nc_new_tensor_from_tensor(const NCTensor *x);
NCTensor *nc_new_tensor_from_tensor_nz(const NCTensor *x);
NCTensor *nc_new_scalar(NCDevice *d, NCTypeEnum type);
NCTensor *nc_new_tensor_1d(NCDevice *d, NCTypeEnum type, size_t len);
NCTensor *nc_new_tensor_2d(NCDevice *d, NCTypeEnum type, size_t n0, size_t n1);
NCTensor *nc_new_tensor_3d(NCDevice *d, NCTypeEnum type,
                           size_t n0, size_t n1, size_t n2);
NCTensor *nc_new_tensor_4d(NCDevice *d, NCTypeEnum type,
                           size_t n0, size_t n1, size_t n2, size_t n3);
NCTensor *__attribute__((format(printf, 2, 3))) nc_tensor_set_name(NCTensor *x, const char *fmt, ...);
NCTensor *nc_dup_tensor(const NCTensor *x);
void nc_free_tensor(NCTensor *x);
void nc_dump_tensor(const char *name, NCTensor *x, size_t n);
uint32_t nc_tensor_get_hash(NCTensor *x);
void nc_dump_tensor_hash(const char *name, const NCTensor *x);
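/* Creation sketch (assumes a device 'd' from above; nc_tensor_set_name()
   takes a printf-like format). nc_dup_tensor()/nc_free_tensor() suggest
   reference counting, but treat that as an assumption:

       NCTensor *x = nc_new_tensor_2d(d, NC_TYPE_F32, 4, 8);
       nc_tensor_set_name(x, "x_%d", 0);
       nc_tensor_set_zero(x);
       nc_free_tensor(x);
*/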
NCNode *nc_get_node(NCTensor *x);
/* create an alias to tensor 'x1'. Gradient is not propagated through it */
NCTensor *nc_slice_alias(const NCTensor *x1, int axis, size_t start, size_t end);
NCTypeEnum nc_tensor_get_item_type(const NCTensor *x);
NCTensorData *nc_tensor_get_data(NCTensorData *sd, const NCTensor *x);
/* Return a pointer to the tensor data. If pstride is not NULL,
   *pstride is set to the stride (in elements) of the first
   dimension. */
void *nc_tensor_get_ptr(NCTensor *x, size_t *pstride);
const size_t *nc_tensor_get_dims(const NCTensor *x, int *pn_dims);
void nc_tensor_set_zero(NCTensor *y);
void nc_tensor_set_f32(NCTensor *y, float val);
NCRNDState *nc_rnd_init(NCDevice *d, uint32_t seed);
void nc_rnd_end(NCRNDState *s);
void nc_tensor_set_rnd_unif(NCTensor *y, float avg, float range,
                            NCRNDState *rnd_state);
void nc_tensor_set_dropout(NCTensor *y, float prob, NCRNDState *rnd_state);
void nc_set1_i32(NCTensor *y, int n_dims, const size_t *tab_indexes,
                 int32_t val);
void nc_set1_i32_1d(NCTensor *y, size_t i0, int32_t val);
void nc_set1_i32_2d(NCTensor *y, size_t i0, size_t i1, int32_t val);
void nc_set1_f32(NCTensor *y, int n_dims, const size_t *tab_indexes,
                 float val);
void nc_set1_f32_1d(NCTensor *y, size_t i0, float val);
int32_t nc_get1_i32(const NCTensor *x, int n_dims, const size_t *tab_indexes);
float nc_get1_f32(const NCTensor *x, int n_dims, const size_t *tab_indexes);
float nc_get1_f32_1d(const NCTensor *x, size_t i0);
float nc_get_scalar_f32(const NCTensor *x);
void nc_tensor_copy(NCTensor *dst, NCTensor *src);
void nc_tensor_convert(NCTensor *dst, NCTensor *src);
void nc_dump_dims(const char *str, NCTensor *x);
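/* Element access sketch (a 1D f32 tensor; indexes are in elements):

       NCTensor *v = nc_new_tensor_1d(d, NC_TYPE_F32, 16);
       nc_set1_f32_1d(v, 3, 1.5f);
       float f = nc_get1_f32_1d(v, 3);  // f == 1.5f
       nc_free_tensor(v);
*/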
size_t nc_get_heap_size(NCContext *m);
NCContext *nc_get_tensor_context(const NCTensor *x);
NCTensor *nc_tensor_to_device(NCTensor *x, NCDevice *d);
NCTensor *nc_tensor_to_cpu_device(NCTensor *x);
NCDevice *nc_get_tensor_device(const NCTensor *x);
/* element wise operations */
NCTensor *nc_convert(NCTensor *x, NCTypeEnum new_type);
NCTensor *nc_add(NCTensor *x1, NCTensor *x2);
NCTensor *nc_neg(NCTensor *x);
NCTensor *nc_sub(NCTensor *x1, NCTensor *x2);
NCTensor *nc_mul(NCTensor *x1, NCTensor *x2);
NCTensor *nc_div(NCTensor *x1, NCTensor *x2);
NCTensor *nc_recip(NCTensor *x);
NCTensor *nc_min(NCTensor *x1, NCTensor *x2);
NCTensor *nc_max(NCTensor *x1, NCTensor *x2);
/* select x1[i] if z[i] = 0 and x2[i] otherwise */
NCTensor *nc_select(NCTensor *z, NCTensor *x1, NCTensor *x2);
/* set y[i] = x[i] if mask[i] = 0 and y[i] = c if mask[i] != 0. If
   mask_inv is TRUE, 'mask' is inverted */
NCTensor *nc_masked_fill(NCTensor *x, NCTensor *mask, float c, BOOL mask_inv);
NCTensor *nc_sigmoid(NCTensor *x);
NCTensor *nc_tanh(NCTensor *x);
NCTensor *nc_relu(NCTensor *x);
NCTensor *nc_gelu(NCTensor *x);
NCTensor *nc_log(NCTensor *x);
/* return cp * fg + min(1 - fg, ig) * in */
NCTensor *nc_lstm_clamped(NCTensor *cp, NCTensor *in,
                          NCTensor *fg, NCTensor *ig);
/* return a * (1 - t) + b * t */
NCTensor *nc_lerp(NCTensor *a, NCTensor *b, NCTensor *t);
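/* Element-wise sketch. The non-const arguments suggest these functions
   consume (take ownership of) their operands -- an assumption here --
   so duplicate a tensor that is needed again:

       // y = a * b + b, keeping a second reference to 'b' for the add
       NCTensor *y = nc_add(nc_mul(a, nc_dup_tensor(b)), b);
*/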
/* other operations */
NCTensor *nc_new_vec_f32(NCDevice *d, size_t n, float val);
NCTensor *nc_new_f32(NCDevice *d, float val);
NCTensor *nc_reshape(NCTensor *x, int n_dims, const size_t *dims);
NCTensor *nc_reshape_1d(NCTensor *x, size_t n0);
NCTensor *nc_reshape_2d(NCTensor *x, size_t n0, size_t n1);
NCTensor *nc_reshape_3d(NCTensor *x, size_t n0, size_t n1, size_t n2);
NCTensor *nc_reshape_4d(NCTensor *x, size_t n0, size_t n1, size_t n2,
                        size_t n3);
/* duplicate the tensor by adding n_dims dimensions */
NCTensor *nc_repeat(NCTensor *x, int n_dims, const size_t *dims);
NCTensor *nc_repeat_1d(NCTensor *x, size_t n);
/* return y0 + sum over the dimensions > n_dims of 'x'. y0 = NULL
   is supported */
NCTensor *nc_reduce_sum(NCTensor *y0, NCTensor *x, int n_dims);
/* sum all the elements of a tensor */
NCTensor *nc_sum(NCTensor *x);
/* sum of squares */
NCTensor *nc_reduce_sum_sqr(NCTensor *x);
NCTensor *nc_slice(NCTensor *x, int axis, size_t start, size_t end);
NCTensor *nc_slice_add(NCTensor *y0, NCTensor *x, int axis, size_t start);
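/* Shape manipulation sketch (sizes and axis values are illustrative;
   reshape keeps the element count, slice keeps indexes start..end-1
   along the given axis):

       NCTensor *x = nc_new_tensor_2d(d, NC_TYPE_F32, 2, 6);
       x = nc_reshape_2d(x, 4, 3);  // same 12 elements, new dims
       x = nc_slice(x, 0, 1, 3);    // indexes 1..2 along axis 0
       NCTensor *s = nc_sum(x);     // scalar: sum of the remaining elements
*/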
/* concatenation along axis 'axis' */
NCTensor *nc_concat(NCTensor **inputs, int n_inputs, int axis);
/* shortcut for axis = 0 */
NCTensor *nc_vconcat(NCTensor **inputs, int n_inputs);
/* shortcut for axis = 1 */
NCTensor *nc_hconcat(NCTensor **inputs, int n_inputs);
/* split along axis 'axis'. If tab_size = NULL, split equally. */
void nc_split(NCTensor **tab_y, NCTensor *x, int n_outputs,
              const size_t *tab_size, int axis);
/* shortcut for axis = 0 */
void nc_vsplit(NCTensor **tab_y, NCTensor *x, int n_outputs,
               const size_t *tab_size);
/* shortcut for axis = 1 */
void nc_hsplit(NCTensor **tab_y, NCTensor *x, int n_outputs,
               const size_t *tab_size);
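/* Concat/split sketch (inputs must agree on the non-concatenated
   dimensions; a NULL tab_size splits into equal parts):

       NCTensor *parts[2] = { a, b };
       NCTensor *y = nc_vconcat(parts, 2);  // concatenate along axis 0

       NCTensor *halves[2];
       nc_vsplit(halves, y, 2, NULL);       // split back into two equal halves
*/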
typedef enum {
    NC_PAD_ZERO,
    NC_PAD_DUP, /* duplicate element */
    /* trim types, dual to padding */
    NC_TRIM_NORMAL = NC_PAD_ZERO,
    NC_TRIM_SUM, /* add trimmed elements to the edge */
} NCPadEnum;
/* pad (len > 0) or trim (len < 0) the axis 0 of 'x' */
NCTensor *nc_pad(NCTensor *x, ssize_t left_len, NCPadEnum left_op,
                 ssize_t right_len, NCPadEnum right_op);
/* shortcut to nc_pad() */
NCTensor *nc_resize(NCTensor *x, size_t n);
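/* Padding sketch (a guess at typical calls; positive lengths pad,
   negative lengths trim, independently for each edge of axis 0):

       y = nc_pad(x, 2, NC_PAD_ZERO, 2, NC_PAD_ZERO);          // 2 zeros each side
       y = nc_pad(y, -1, NC_TRIM_NORMAL, -1, NC_TRIM_NORMAL);  // drop 1 each side
*/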
/* if x is not contiguous then create a new contiguous tensor and copy
   x to it. Otherwise, return 'x'. */
NCTensor *nc_make_contiguous(NCTensor *x);
/* Return a new tensor sharing the same buffer as 'x' with the permuted
   dimensions. axis[i] is the corresponding axis in 'x' */
NCTensor *nc_permute_alias(NCTensor *x, int n_dims, const int *axis);
/* same as nc_permute_alias but calls nc_make_contiguous after. */
NCTensor *nc_permute(NCTensor *x, int n_dims, const int *axis);
/* special case of nc_permute() */
NCTensor *nc_transpose(NCTensor *x);
NCTensor *nc_matmul(NCTensor *w, NCTensor *x);
/* return w*x + y0. w and x can be optionally transposed. y0 can be NULL */
NCTensor *nc_matmul_add(NCTensor *w, NCTensor *x, NCTensor *y0,
                        BOOL w_trans, BOOL x_trans);
NCTensor *nc_matmul_stride(NCTensor *w, NCTensor *x);
/* return a matrix where each column is the column x[i] of matrix 'w' */
NCTensor *nc_get_col(NCTensor *w, NCTensor *x);
/* add the vectors 'z' at column number 'x' in matrix 'w'. */
NCTensor *nc_add_col(NCTensor *z, NCTensor *x, NCTensor *w);
/* select the x-th element in each column of 'w' */
NCTensor *nc_get_element(NCTensor *w, NCTensor *x);
/* add z to the x-th element in each column of 'w' */
NCTensor *nc_add_element(NCTensor *z, NCTensor *x, NCTensor *w);
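/* Matrix operation sketch. nc_matmul_add() fuses the bias addition;
   nc_get_col() can serve as an embedding lookup (column gather). The
   variable names are illustrative:

       NCTensor *y = nc_matmul(w, x);                        // y = w * x
       NCTensor *z = nc_matmul_add(w2, y, b, FALSE, FALSE);  // z = w2 * y + b
       NCTensor *e = nc_get_col(emb, idx);  // one column of 'emb' per index
*/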
NCTensor *nc_soft_max(NCTensor *x);
/* Equivalent to y = log(get_element(x, eout)). It is expected to be
   used as nc_indexed_log(nc_soft_max(x), eout) so that the gradient
   computation is optimized. */
NCTensor *nc_indexed_log(NCTensor *x, NCTensor *eout);
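/* Cross-entropy sketch, following the documented pairing of
   nc_soft_max() with nc_indexed_log():

       // logits: network output, eout: expected symbol indexes
       NCTensor *loss = nc_neg(nc_indexed_log(nc_soft_max(logits), eout));
*/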
NCTensor *nc_layer_norm(NCTensor *x, float eps);
NCTensor *nc_rms_norm(NCTensor *x, float eps);
NCTensor *nc_slt_mat_set(NCTensor *x, size_t pos, float c);
/* shift the column 'i' by 'pos + i * mult' elements and pad with zeros */
NCTensor *nc_rel_shift(NCTensor *x, ssize_t pos, ssize_t mult);
/* auto differentiation */
/* get_col_index is non NULL in the sparse gradient case */
typedef void NCParamUpdateFunc(void *opaque, NCTensor *grad,
                               NCTensor *get_col_index);
/* add a 'parameter' graph node to 'x' and return 'x'. */
NCTensor *nc_set_param(NCTensor *x, void *opaque);
/* return a new tensor with its graph removed */
NCTensor *nc_stop_grad(NCTensor *x);
/* manipulation of graph nodes */
NCNode *nc_dup_node(const NCNode *n);
void nc_free_node(NCNode *n);
void nc_combine_nodes(NCContext *m, NCNode **tab_op1, int count,
                      int axis, int elem_size, const size_t *tab_elem_size);
NCNode *nc_concat_node(NCContext *m, NCNode **inputs, int count,
                       int axis, const size_t *tab_size);
void nc_concat_optimization(NCContext *m, NCNode **concat_nodes, int count);
void nc_node_set_parent(NCNode *n, int arg_index, const NCNode *n1);
void nc_node_set_arg(NCNode *n, int arg_index, const NCTensor *x);
#define NC_BW_KEEP_GRAD_GRAPH (1 << 0)
/* optimize the nc_get_col() gradient */
#define NC_BW_SPARSE_GRAD (1 << 1)
void nc_backward(const NCTensor *x, NCTensor *grad,
                 NCParamUpdateFunc *param_update_func, int flags);
void nc_dump_graph(NCTensor *x);
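/* Training-step sketch (speculative wiring: it assumes parameters were
   registered with nc_set_param() and that sgd_opt_update_var() below is
   the matching NCParamUpdateFunc, with 'd' and 's' as elsewhere):

       NCTensor *loss = ...;  // forward pass
       nc_backward(loss, nc_new_f32(d, 1.0f),  // seed gradient of 1.0
                   sgd_opt_update_var, NC_BW_SPARSE_GRAD);
       nc_sgd_opt_update(s);
*/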
/* utilities for function parameters */
typedef struct {
    struct list_head link;
    NCTensor **pval; /* pointer to the tensor location */
    char *name; /* parameter name */
    NCTensor *low_part; /* if BF16 parameter, additional 16 bit precision */
    NCTensor *saved_grad; /* debug */
    /* SGD opt data */
    struct SGDOptVarState *sgd_opt;
} NCParam;
typedef struct {
    struct list_head param_list;
    BOOL add_graph;
} NCParamList;
void nc_param_list_init(NCParamList *pl);
void nc_param_list_set_graph(NCParamList *pl, BOOL add_graph);
NCParam *nc_new_param_str(NCParamList *pl, NCTensor **pval, const char *str);
__attribute__((format(printf, 3, 4))) NCParam *nc_new_param(NCParamList *pl, NCTensor **pval, const char *fmt, ...);
void nc_param_list_end(NCParamList *pl);
NCParam *nc_find_param(NCParamList *pl, const char *name);
size_t nc_get_param_count(NCParamList *pl);
void nc_save_coefs(NCParamList *pl, const char *filename);
void nc_load_coefs(NCParamList *pl, const char *filename);
void nc_save_state(NCParamList *pl, const char *filename);
void nc_load_state(NCParamList *pl, const char *filename);
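/* Parameter-list sketch (the parameter name and file name are
   illustrative):

       NCParamList pl;
       nc_param_list_init(&pl);
       NCTensor *w = nc_new_tensor_2d(d, NC_TYPE_F32, 64, 64);
       nc_new_param(&pl, &w, "layer%d.w", 0);
       nc_save_coefs(&pl, "model.bin");
       nc_param_list_end(&pl);
*/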
/* SGD optimizer */
typedef enum {
    SGD_OPT_BASIC,
    SGD_OPT_ADAM,
    SGD_OPT_TEST,
} SGDOptAlgoEnum;
typedef struct {
    SGDOptAlgoEnum algo;
    union {
        struct {
            float beta1;
            float beta2;
            float eps;
            float gradient_clip; /* if != 0, per parameter gradient clipping */
        } adam;
    } u;
    float lr;
} SGDOptParams;
NCSGDOptState *nc_sgd_opt_init(NCContext *m, const SGDOptParams *p);
void nc_sgd_opt_end(NCSGDOptState *s);
void sgd_opt_update_var(void *opaque, NCTensor *yg, NCTensor *get_col_index);
/* set the SGD optimizer 's' to all parameters of the model */
void nc_sgd_opt_set_all(NCParamList *param_list, NCSGDOptState *s);
/* set the SGD optimizer 's' to the variable 'x'. Remove it if s = NULL */
void nc_sgd_opt_set(NCParam *x, NCSGDOptState *s);
void nc_sgd_opt_update(NCSGDOptState *s);
/* force the learning rate */
void nc_sgd_opt_set_lr(NCSGDOptState *s, float lr);
float nc_sgd_opt_get_lr(NCSGDOptState *s);
/* for SGD_OPT_TEST */
NCTensor *nc_sgd_opt_get_grad(NCParam *p);
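/* Optimizer setup sketch (the Adam hyperparameters shown are common
   defaults, not values mandated by this API):

       SGDOptParams p = { 0 };
       p.algo = SGD_OPT_ADAM;
       p.u.adam.beta1 = 0.9f;
       p.u.adam.beta2 = 0.999f;
       p.u.adam.eps = 1e-8f;
       p.lr = 1e-3f;
       NCSGDOptState *s = nc_sgd_opt_init(m, &p);
       nc_sgd_opt_set_all(&param_list, s);  // attach to all parameters
*/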
/* misc utilities (to be removed) */
typedef struct {
    uint32_t seed;
    /* used by the Gaussian generator */
    int idx;
    float y1;
} RNDState;
typedef struct {
    uint16_t u16;
} nc_float16_t;
void rnd_init(RNDState *s, uint32_t seed);
uint32_t rnd_unif_u32(RNDState *s);
float rnd_unif(RNDState *s);
void rnd_unif_vec(float *tab, size_t n, float mu, float range,
                  RNDState *s);
void rnd_unif_mat(float *tab, size_t stride, size_t h, size_t w,
                  float mu, float sigma, RNDState *s);
float vec_sum_f32(const float *tab, size_t n);
typedef struct {
    float val;
    uint32_t idx;
} NCTopKEntry;
/* Return the k largest values among prob[0...n-1], where k is the
   largest integer such that k <= topk and sum(i = 0 ... k - 2,
   prob[tab[i]]) < topp.
   It is assumed that prob[i] >= 0. The function returns (k, tab,
   sum). 'sum' is the sum of the k returned values. 'tab' must be
   freed with nc_free(). */
int nc_topk(NCTopKEntry **ptab, double *psum,
            const float *prob, size_t n, int topk, float topp);
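/* Top-k / top-p (nucleus) filtering sketch over a probability vector
   'prob' of length 'n' (the cutoff values are illustrative):

       NCTopKEntry *tab;
       double sum;
       int k = nc_topk(&tab, &sum, prob, n, 40, 0.95f);
       // tab[0..k-1].idx: selected symbols, tab[i].val: their probabilities
       nc_free(tab);
*/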
#endif /* LIBNC_H */