blob: 3dfe9269780c9444a7fb6d0d2493746f6b9c2fdd [file] [log] [blame]
/*
* Copyright (C) Igor Sysoev
* Copyright (C) NGINX, Inc.
*/
#ifndef _NJS_STRING_H_INCLUDED_
#define _NJS_STRING_H_INCLUDED_
/*
* nJSVM supports two string variants:
*
* 1) short strings whose size is less than or equal to 14 (NJS_STRING_SHORT)
* bytes; these strings are stored inside njs_value_t (see njs_vm.h for
* details);
*
* 2) and long strings using additional njs_string_t structure.
* This structure has the start field to support external strings.
* The long strings can have optional UTF-8 offset map.
*
* The number of the string variants is limited to 2 variants to minimize
* overhead of processing string fields.
*/
/*
 * Maximum string length: the largest value of a signed 32-bit integer,
 * 2^31 - 1 (0x7fffffff).
 */
#define NJS_STRING_MAX_LENGTH  2147483647
/*
 * NJS_STRING_MAP_STRIDE should be a power of two so that shift and binary
 * AND operations can be used instead of division and remainder operations,
 * but no less than 16 because the maximum length of a short string inlined
 * in njs_value_t is less than 16 bytes.
 */
#define NJS_STRING_MAP_STRIDE  32

/*
 * Passes a byte size to njs_align_size() with uint32_t alignment.
 * NOTE(review): presumably this rounds the size up to the position where
 * the offset map starts -- confirm against njs_align_size().
 */
#define njs_string_map_offset(size)                                           \
    njs_align_size((size), sizeof(uint32_t))

/*
 * Passes the pointer "p" to njs_align_ptr() with uint32_t alignment and
 * casts the result to a pointer to uint32_t map elements.
 */
#define njs_string_map_start(p)                                               \
    ((uint32_t *) njs_align_ptr((p), sizeof(uint32_t)))

/*
 * Size in bytes of the offset map for a string of "length" UTF-8
 * characters: ((length - 1) / NJS_STRING_MAP_STRIDE) elements of uint32_t.
 *
 * The argument is parenthesized so that an expression argument (for
 * example "cond ? a : b" or "x & mask") is evaluated as a whole before
 * one is subtracted from it.
 */
#define njs_string_map_size(length)                                           \
    ((((length) - 1) / NJS_STRING_MAP_STRIDE) * sizeof(uint32_t))
/*
* ECMAScript strings are stored in UTF-16. nJSVM however, allows to store
* any byte sequences in strings. A size of string in bytes is stored in the
* size field. If byte sequence is valid UTF-8 string then its length is
* stored in the UTF-8 length field. Otherwise, the length field is zero.
* If a string is UTF-8 string then string functions use UTF-8 characters
* positions and lengths. Otherwise they use byte positions and lengths.
* Using UTF-8 encoding does not allow to get quickly a character at specified
* position. To speed up this search a map of offsets is stored after the
* UTF-8 string. The map is aligned to uint32_t and contains byte positions
* of each NJS_STRING_MAP_STRIDE UTF-8 character except zero position. The
* map can be initialized on demand. An uninitialized map is marked with a zero
* value in the first map element. If string comes outside JavaScript as
* byte string just to be concatenated or to match regular expressions the
* offset map is not required.
*
* The map is not allocated:
* 1) if string length is zero hence string is a byte string;
* 2) if string size and length are equal so the string contains only
* ASCII characters and map is not required;
* 3) if string length is less than NJS_STRING_MAP_STRIDE.
*
* The current implementation does not support Unicode surrogate pairs.
* It can be implemented later if it will be required using the following
* algorithm: if offset in map points to surrogate pair then the previous
* offset should be used and so on until start of the string.
*/
/*
 * Additional structure used by long strings.  It carries its own "start"
 * pointer so that a long string can refer to external string data.
 */
struct njs_string_s {
    /* NOTE(review): presumably the first byte of the string data -- confirm. */
    u_char *start;
    uint32_t length; /* Length in UTF-8 characters. */
    uint32_t retain; /* Link counter. */
};
/*
 * String properties examined by the helpers in this header:
 * njs_is_byte_string() treats "length == 0 && size != 0" as a byte string,
 * and njs_is_byte_or_ascii_string() treats "length == 0" or
 * "length == size" as a byte or ASCII string.
 */
typedef struct {
    /* Size; compared against "length" to detect ASCII-only strings. */
    size_t size;
    /* Length; zero marks a byte string when "size" is non-zero. */
    size_t length;
    /* NOTE(review): presumably the first byte of the string data -- confirm. */
    u_char *start;
} njs_string_prop_t;
/*
 * Slice description passed to njs_string_slice() and
 * njs_string_slice_string_prop().
 *
 * NOTE(review): this header does not show the units of the fields (bytes
 * vs. characters); the per-field notes are inferred from the names only
 * and should be confirmed against the implementation.
 */
typedef struct {
    size_t start;           /* presumably where the slice begins */
    size_t length;          /* presumably the length of the slice */
    size_t string_length;   /* presumably the length of the sliced string */
} njs_slice_prop_t;
/*
 * String kind selector, as interpreted by njs_string_calc_length():
 * a byte string has length zero, an ASCII string has length equal to its
 * size, and the length of a UTF-8 string is counted from the data.
 */
typedef enum {
    NJS_STRING_BYTE  = 0,
    NJS_STRING_ASCII = 1,
    NJS_STRING_UTF8  = 2
} njs_utf8_t;
/*
 * Tells whether "string" is a non-empty byte string, that is, its length
 * is zero while its size is not.
 */
njs_inline njs_bool_t
njs_is_byte_string(njs_string_prop_t *string)
{
    if (string->length != 0) {
        return 0;
    }

    return (string->size != 0);
}
/*
 * Tells whether "string" is a byte string (zero length) or an ASCII-only
 * string (length equal to size).
 */
njs_inline njs_bool_t
njs_is_byte_or_ascii_string(njs_string_prop_t *string)
{
    if (string->length == 0) {
        return 1;
    }

    return (string->length == string->size);
}
/*
 * Computes the value of the length field for "size" bytes at "start":
 *
 *   NJS_STRING_BYTE   zero;
 *   NJS_STRING_ASCII  "size";
 *   anything else     the result of njs_utf8_length(), or zero if that
 *                     result is negative.
 */
njs_inline uint32_t
njs_string_calc_length(njs_utf8_t utf8, const u_char *start, size_t size)
{
    ssize_t  chars;

    if (utf8 == NJS_STRING_BYTE) {
        return 0;
    }

    if (utf8 == NJS_STRING_ASCII) {
        return size;
    }

    /* NJS_STRING_UTF8 and any other value. */

    chars = njs_utf8_length(start, size);

    if (chars < 0) {
        return 0;
    }

    return chars;
}
/*
 * Returns the length of a string value, falling back to its size when the
 * stored length is zero.  The size and length are taken from the long
 * string fields when short_string.size equals NJS_STRING_LONG, and from
 * the short string fields otherwise.
 */
njs_inline uint32_t
njs_string_length(njs_value_t *string)
{
    uint32_t  bytes, chars;

    if (string->short_string.size == NJS_STRING_LONG) {
        bytes = string->long_string.size;
        chars = string->long_string.data->length;

    } else {
        bytes = string->short_string.size;
        chars = string->short_string.length;
    }

    if (chars != 0) {
        return chars;
    }

    return bytes;
}
/*
 * Tests the bit for "byte" in the "escape" bitmap: bits are packed 32 per
 * uint32_t word, so the word index is (byte / 32) and the bit index within
 * the word is (byte % 32).
 */
njs_inline njs_bool_t
njs_need_escape(const uint32_t *escape, uint32_t byte)
{
    uint32_t  word, mask;

    word = escape[byte >> 5];
    mask = (uint32_t) 1 << (byte & 0x1f);

    return ((word & mask) != 0);
}
/*
 * Copies "size" bytes from "src" to "dst", replacing every byte whose bit
 * is set in the "escape" bitmap (see njs_need_escape()) with "%XX", where
 * XX is the byte value as two uppercase hexadecimal digits.
 *
 * Each source byte produces either one or three output bytes, so "dst"
 * must have room for up to 3 * size bytes.  No terminating zero is
 * written.
 *
 * Returns the position in "dst" just past the last byte written.  With
 * zero "size" nothing is read or written and "dst" is returned unchanged.
 */
njs_inline u_char *
njs_string_encode(const uint32_t *escape, size_t size, const u_char *src,
    u_char *dst)
{
    uint8_t              byte;
    static const u_char  hex[16] = "0123456789ABCDEF";

    /*
     * The size is tested before the first iteration: a do-while loop
     * would touch both buffers for an empty input and then wrap
     * "size" around to SIZE_MAX.
     */

    while (size != 0) {
        byte = *src++;

        if (njs_need_escape(escape, byte)) {
            *dst++ = '%';
            *dst++ = hex[byte >> 4];
            *dst++ = hex[byte & 0xf];

        } else {
            *dst++ = byte;
        }

        size--;
    }

    return dst;
}
/*
 * Functions that fill in "value" from a VM, a data pointer and a size.
 *
 * NOTE(review): the implementations are not part of this header; whether
 * the data is copied or referenced, and the meaning of the return values,
 * must be confirmed in the implementation file.
 */
njs_int_t njs_string_set(njs_vm_t *vm, njs_value_t *value, const u_char *start,
    uint32_t size);
u_char *njs_string_alloc(njs_vm_t *vm, njs_value_t *value, uint64_t size,
    uint64_t length);
njs_int_t njs_string_new(njs_vm_t *vm, njs_value_t *value, const u_char *start,
    uint32_t size, uint32_t length);
njs_int_t njs_string_create(njs_vm_t *vm, njs_value_t *value, const char *src,
    size_t size);
/*
 * Conversions between njs_str_t buffers.  Each conversion is declared as
 * a pair: njs_<op>(dst, src) and njs_<op>_length(src, out_size).
 *
 * NOTE(review): presumably the "_length" function is called first to learn
 * the output size needed for "dst" -- confirm in the implementation.
 */
void njs_encode_hex(njs_str_t *dst, const njs_str_t *src);
size_t njs_encode_hex_length(const njs_str_t *src, size_t *out_size);
void njs_encode_base64(njs_str_t *dst, const njs_str_t *src);
size_t njs_encode_base64_length(const njs_str_t *src, size_t *out_size);
void njs_decode_utf8(njs_str_t *dst, const njs_str_t *src);
size_t njs_decode_utf8_length(const njs_str_t *src, size_t *out_size);
void njs_decode_hex(njs_str_t *dst, const njs_str_t *src);
size_t njs_decode_hex_length(const njs_str_t *src, size_t *out_size);
void njs_decode_base64(njs_str_t *dst, const njs_str_t *src);
size_t njs_decode_base64_length(const njs_str_t *src, size_t *out_size);
void njs_decode_base64url(njs_str_t *dst, const njs_str_t *src);
size_t njs_decode_base64url_length(const njs_str_t *src, size_t *out_size);

/*
 * Counterparts of the conversions above that take a VM and store the
 * result in "value" instead of a caller-supplied njs_str_t.
 */
njs_int_t njs_string_hex(njs_vm_t *vm, njs_value_t *value,
    const njs_str_t *src);
njs_int_t njs_string_base64(njs_vm_t *vm, njs_value_t *value,
    const njs_str_t *src);
njs_int_t njs_string_base64url(njs_vm_t *vm, njs_value_t *value,
    const njs_str_t *src);
njs_int_t njs_string_decode_utf8(njs_vm_t *vm, njs_value_t *value,
    const njs_str_t *src);
njs_int_t njs_string_decode_hex(njs_vm_t *vm, njs_value_t *value,
    const njs_str_t *src);
njs_int_t njs_string_decode_base64(njs_vm_t *vm, njs_value_t *value,
    const njs_str_t *src);
njs_int_t njs_string_decode_base64url(njs_vm_t *vm, njs_value_t *value,
    const njs_str_t *src);
/*
 * Operations on existing string values and on njs_string_prop_t /
 * njs_slice_prop_t descriptions of them.
 *
 * NOTE(review): only the signatures are visible here; see the
 * implementation file for the exact contracts.
 */
void njs_string_truncate(njs_value_t *value, uint32_t size, uint32_t length);
void njs_string_copy(njs_value_t *dst, njs_value_t *src);
njs_int_t njs_string_validate(njs_vm_t *vm, njs_string_prop_t *string,
    njs_value_t *value);
size_t njs_string_prop(njs_string_prop_t *string, const njs_value_t *value);
njs_int_t njs_string_cmp(const njs_value_t *val1, const njs_value_t *val2);
void njs_string_slice_string_prop(njs_string_prop_t *dst,
    const njs_string_prop_t *string, const njs_slice_prop_t *slice);
njs_int_t njs_string_slice(njs_vm_t *vm, njs_value_t *dst,
    const njs_string_prop_t *string, const njs_slice_prop_t *slice);

/*
 * Character position and offset map helpers.
 * NOTE(review): presumably these relate to the UTF-8 offset map described
 * at the top of this file -- confirm.
 */
const u_char *njs_string_offset(const u_char *start, const u_char *end,
    size_t index);
uint32_t njs_string_index(njs_string_prop_t *string, uint32_t offset);
void njs_string_offset_map_init(const u_char *start, size_t size);

/* Conversions from a string value to other representations. */
double njs_string_to_index(const njs_value_t *value);
const char *njs_string_to_c_string(njs_vm_t *vm, njs_value_t *value);
/*
 * Functions sharing the (vm, args, nargs, index) signature.
 * NOTE(review): presumably these are native handlers invoked by the VM
 * with the call arguments in "args" -- confirm.
 */
njs_int_t njs_string_encode_uri(njs_vm_t *vm, njs_value_t *args,
    njs_uint_t nargs, njs_index_t component);
njs_int_t njs_string_decode_uri(njs_vm_t *vm, njs_value_t *args,
    njs_uint_t nargs, njs_index_t component);
njs_int_t njs_string_prototype_concat(njs_vm_t *vm, njs_value_t *args,
    njs_uint_t nargs, njs_index_t unused);

/*
 * Helpers with other signatures.
 * NOTE(review): presumably used by split and replace operations, judging
 * by the names only -- confirm.
 */
njs_int_t njs_string_split_part_add(njs_vm_t *vm, njs_array_t *array,
    njs_utf8_t utf8, const u_char *start, size_t size);
njs_int_t njs_string_get_substitution(njs_vm_t *vm, njs_value_t *matched,
    njs_value_t *string, int64_t pos, njs_value_t *captures, int64_t ncaptures,
    njs_value_t *groups, njs_value_t *replacement, njs_value_t *retval);
/*
 * Constant initializer tables defined outside this header.
 * NOTE(review): presumably these describe the String instance and the
 * String type respectively -- confirm where they are defined.
 */
extern const njs_object_init_t njs_string_instance_init;
extern const njs_object_type_init_t njs_string_type_init;
#endif /* _NJS_STRING_H_INCLUDED_ */