blob: bce8be84b6255b1f567552f47c975e504ede2abc [file] [log] [blame]
/*
* Copyright (C) Igor Sysoev
* Copyright (C) NGINX, Inc.
*/
#include <njs_main.h>
/*
* The njs_unicode_lower_case.h and njs_unicode_upper_case.h files are
* auto-generated from the UnicodeData.txt file version 14.0.0 (May 2021)
* provided by Unicode, Inc.:
*
* ./njs_unicode_lower_case.pl UnicodeData.txt
* ./njs_unicode_upper_case.pl UnicodeData.txt
*
* Only common and simple case foldings are supported. Full case foldings
* are not supported. Combined characters are also not supported.
*/
#include <njs_unicode_lower_case.h>
#include <njs_unicode_upper_case.h>
/*
 * Writes the UTF-8 encoding of the code point "u" at "p".
 *
 * Returns the position just past the last byte written, or NULL (with
 * nothing written) when "u" is 0x110000 or greater.  The caller must
 * provide room for up to four bytes.
 */
u_char *
njs_utf8_encode(u_char *p, uint32_t u)
{
    u_char    lead;
    unsigned  shift;

    /* ASCII is stored as is. */

    if (u < 0x80) {
        *p = (u_char) (u & 0xFF);
        return p + 1;
    }

    /*
     * Select the lead byte marker and the bit position of the payload
     * carried by the lead byte: 6, 12 or 18 for sequences of two, three
     * or four bytes.
     */

    if (u < 0x0800) {
        lead = 0xC0;
        shift = 6;

    } else if (u < 0x10000) {
        lead = 0xE0;
        shift = 12;

    } else if (u < 0x110000) {
        lead = 0xF0;
        shift = 18;

    } else {
        return NULL;
    }

    *p++ = (u_char) ((u >> shift) | lead);

    /* Each continuation byte carries the next lower six bits. */

    while (shift != 0) {
        shift -= 6;
        *p++ = (u_char) (((u >> shift) & 0x3F) | 0x80);
    }

    return p;
}
/*
 * Checks that the byte at *data lies within [lower, upper].
 *
 * On success the low six bits of the byte are appended to ctx->codepoint,
 * *data is advanced by one, *need is decremented and NJS_OK is returned.
 * Otherwise NJS_ERROR is returned and nothing is modified.
 */
njs_inline njs_int_t
njs_utf8_boundary(njs_unicode_decode_t *ctx, const u_char **data,
    unsigned *need, u_char lower, u_char upper)
{
    const u_char  *pos;

    pos = *data;

    if (*pos >= lower && *pos <= upper) {
        ctx->codepoint = (ctx->codepoint << 6) | (*pos & 0x3F);

        *data = pos + 1;
        *need -= 1;

        return NJS_OK;
    }

    return NJS_ERROR;
}
/*
 * Stores in the context the range the next byte must fall in when the lead
 * byte "ch" is one of the two special lead bytes:
 *
 *   ch == first:   [lower, 0xBF];
 *   ch == second:  [0x80, upper].
 *
 * For any other lead byte the context is left as it was.
 */
njs_inline void
njs_utf8_boundary_set(njs_unicode_decode_t *ctx, const u_char ch,
    u_char first, u_char second, u_char lower, u_char upper)
{
    if (ch == first) {
        ctx->lower = lower;
        ctx->upper = 0xBF;
        return;
    }

    if (ch == second) {
        ctx->lower = 0x80;
        ctx->upper = upper;
    }
}
/*
 * Decodes one code point from the bytes in [*start, end).  State of an
 * unfinished sequence is kept in "ctx", so a sequence split between two
 * input chunks can be resumed by the next call.
 *
 * Returns:
 *   the decoded code point;
 *   NJS_UNICODE_CONTINUE if the input ended inside a sequence, the number
 *     of bytes still expected is saved in ctx->need;
 *   NJS_UNICODE_ERROR on an invalid byte.
 *
 * *start is advanced past the bytes consumed.  An invalid lead byte is
 * consumed, while an invalid continuation byte is not: *start is left
 * pointing at it so the next call examines it as a lead byte.
 *
 * The first byte is read without comparing *start with "end", so the
 * caller must guarantee *start < end.
 */
uint32_t
njs_utf8_decode(njs_unicode_decode_t *ctx, const u_char **start,
const u_char *end)
{
u_char c;
unsigned need;
njs_int_t ret;
const u_char *p;
/* Resume a sequence left unfinished by the previous call. */
if (ctx->need != 0) {
need = ctx->need;
ctx->need = 0;
/*
 * A non-zero ctx->lower means the previous input ended right after
 * a lead byte whose following byte has a restricted range, see the
 * njs_utf8_boundary_set() calls below.
 */
if (ctx->lower != 0x00) {
ret = njs_utf8_boundary(ctx, start, &need, ctx->lower, ctx->upper);
if (njs_slow_path(ret != NJS_OK)) {
goto failed;
}
ctx->lower = 0x00;
}
goto decode;
}
c = *(*start)++;
/* Classify the lead byte. */
if (c < 0x80) {
/* ASCII. */
return c;
} else if (c <= 0xDF) {
/* 0x80..0xC1 cannot start a sequence. */
if (c < 0xC2) {
return NJS_UNICODE_ERROR;
}
/* Two-byte sequence: five payload bits in the lead byte. */
need = 1;
ctx->codepoint = c & 0x1F;
} else if (c < 0xF0) {
/* Three-byte sequence: four payload bits in the lead byte. */
need = 2;
ctx->codepoint = c & 0x0F;
/* No more input: save the range the next byte has to be checked against. */
if (*start == end) {
njs_utf8_boundary_set(ctx, c, 0xE0, 0xED, 0xA0, 0x9F);
goto next;
}
ret = NJS_OK;
/*
 * The second byte is restricted to 0xA0..0xBF after 0xE0 and to
 * 0x80..0x9F after 0xED; in UTF-8 terms this excludes overlong forms
 * and the surrogate range respectively.
 */
if (c == 0xE0) {
ret = njs_utf8_boundary(ctx, start, &need, 0xA0, 0xBF);
} else if (c == 0xED) {
ret = njs_utf8_boundary(ctx, start, &need, 0x80, 0x9F);
}
if (njs_slow_path(ret != NJS_OK)) {
goto failed;
}
} else if (c < 0xF5) {
/* Four-byte sequence: three payload bits in the lead byte. */
need = 3;
ctx->codepoint = c & 0x07;
/* No more input: save the range the next byte has to be checked against. */
if (*start == end) {
njs_utf8_boundary_set(ctx, c, 0xF0, 0xF4, 0x90, 0x8F);
goto next;
}
ret = NJS_OK;
/*
 * The second byte is restricted to 0x90..0xBF after 0xF0 and to
 * 0x80..0x8F after 0xF4; in UTF-8 terms this excludes overlong forms
 * and values above 0x10FFFF respectively.
 */
if (c == 0xF0) {
ret = njs_utf8_boundary(ctx, start, &need, 0x90, 0xBF);
} else if (c == 0xF4) {
ret = njs_utf8_boundary(ctx, start, &need, 0x80, 0x8F);
}
if (njs_slow_path(ret != NJS_OK)) {
goto failed;
}
} else {
/* 0xF5..0xFF are never valid lead bytes. */
return NJS_UNICODE_ERROR;
}
decode:
/* Collect the remaining continuation bytes, each must be 0x80..0xBF. */
for (p = *start; p < end; p++) {
c = *p;
if (c < 0x80 || c > 0xBF) {
/* Leave the offending byte unconsumed. */
*start = p;
goto failed;
}
ctx->codepoint = (ctx->codepoint << 6) | (c & 0x3F);
if (--need == 0) {
*start = p + 1;
return ctx->codepoint;
}
}
/* The input is exhausted inside the sequence. */
*start = p;
next:
ctx->need = need;
return NJS_UNICODE_CONTINUE;
failed:
/* Drop the saved state so that the next call starts with a lead byte. */
ctx->lower = 0x00;
ctx->need = 0;
return NJS_UNICODE_ERROR;
}
/*
 * Decodes the chunk [start, end) with the streaming decoder "ctx" and
 * writes the decoded code points to "dst" re-encoded as UTF-8.
 *
 * An invalid sequence is replaced by NJS_UNICODE_REPLACEMENT, or makes
 * the function return NULL when "fatal" is set.  When "last" is set and
 * the decoder still expects continuation bytes, the unfinished sequence
 * is handled in the same way.
 *
 * Returns the position in "dst" past the last byte written, or NULL.
 */
u_char *
njs_utf8_stream_encode(njs_unicode_decode_t *ctx, const u_char *start,
    const u_char *end, u_char *dst, njs_bool_t last, njs_bool_t fatal)
{
    uint32_t  codepoint;

    while (start < end) {
        codepoint = njs_utf8_decode(ctx, &start, end);

        if (codepoint <= NJS_UNICODE_MAX_CODEPOINT) {
            dst = njs_utf8_encode(dst, codepoint);
            continue;
        }

        /* The chunk ended inside a sequence, wait for more input. */

        if (codepoint == NJS_UNICODE_CONTINUE) {
            break;
        }

        if (fatal) {
            return NULL;
        }

        dst = njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
    }

    if (!last || ctx->need == 0x00) {
        return dst;
    }

    /* The final chunk left a sequence unfinished. */

    if (fatal) {
        return NULL;
    }

    return njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
}
/*
* njs_utf8_casecmp() tests only up to the minimum of given lengths, but
* requires lengths of both strings because otherwise njs_utf8_decode()
* may fail due to incomplete sequence.
*/
njs_int_t
njs_utf8_casecmp(const u_char *start1, const u_char *start2, size_t len1,
size_t len2)
{
int32_t n;
uint32_t u1, u2;
const u_char *end1, *end2;
end1 = start1 + len1;
end2 = start2 + len2;
while (start1 < end1 && start2 < end2) {
u1 = njs_utf8_lower_case(&start1, end1);
u2 = njs_utf8_lower_case(&start2, end2);
if (njs_slow_path((u1 | u2) == 0xFFFFFFFF)) {
return NJS_UNICODE_ERROR;
}
n = u1 - u2;
if (n != 0) {
return (njs_int_t) n;
}
}
return 0;
}
/*
 * Decodes one code point at *start, advances *start past it and returns
 * its lower case mapping taken from the generated tables.
 *
 * A value without a table entry is returned unchanged; this includes
 * whatever njs_utf8_decode() returns for a sequence it cannot decode.
 * The caller must guarantee *start < end.
 */
uint32_t
njs_utf8_lower_case(const u_char **start, const u_char *end)
{
    uint32_t              cp;
    const uint32_t        *page;
    njs_unicode_decode_t  ctx;

    cp = (uint32_t) **start;

    /* ASCII is mapped directly through the first table block. */

    if (njs_fast_path(cp < 0x80)) {
        (*start)++;
        return njs_unicode_lower_case_block_000[cp];
    }

    njs_utf8_decode_init(&ctx);

    cp = njs_utf8_decode(&ctx, start, end);

    if (cp > NJS_UNICODE_MAX_LOWER_CASE) {
        return cp;
    }

    /* A NULL block means that no code point in it has a mapping. */

    page = njs_unicode_lower_case_blocks[cp / NJS_UNICODE_BLOCK_SIZE];

    return (page != NULL) ? page[cp % NJS_UNICODE_BLOCK_SIZE] : cp;
}
/*
 * Decodes one code point at *start, advances *start past it and returns
 * its upper case mapping taken from the generated tables.
 *
 * A value without a table entry is returned unchanged; this includes
 * whatever njs_utf8_decode() returns for a sequence it cannot decode.
 * The caller must guarantee *start < end.
 */
uint32_t
njs_utf8_upper_case(const u_char **start, const u_char *end)
{
    uint32_t              cp;
    const uint32_t        *page;
    njs_unicode_decode_t  ctx;

    cp = (uint32_t) **start;

    /* ASCII is mapped directly through the first table block. */

    if (njs_fast_path(cp < 0x80)) {
        (*start)++;
        return njs_unicode_upper_case_block_000[cp];
    }

    njs_utf8_decode_init(&ctx);

    cp = njs_utf8_decode(&ctx, start, end);

    if (cp > NJS_UNICODE_MAX_UPPER_CASE) {
        return cp;
    }

    /* A NULL block means that no code point in it has a mapping. */

    page = njs_unicode_upper_case_blocks[cp / NJS_UNICODE_BLOCK_SIZE];

    return (page != NULL) ? page[cp % NJS_UNICODE_BLOCK_SIZE] : cp;
}
/*
 * Counts the code points the streaming decoder "ctx" produces for the
 * chunk of "len" bytes at "p", and the number of bytes needed to store
 * them as UTF-8.
 *
 * An invalid sequence is counted as one NJS_UNICODE_REPLACEMENT, or makes
 * the function return -1 when "fatal" is set.  When "last" is set and
 * the decoder still expects continuation bytes, the unfinished sequence
 * is handled in the same way.
 *
 * Returns the number of code points.  The byte size is stored in
 * *out_size if "out_size" is not NULL; it is not stored when -1 is
 * returned.
 */
ssize_t
njs_utf8_stream_length(njs_unicode_decode_t *ctx, const u_char *p, size_t len,
    njs_bool_t last, njs_bool_t fatal, size_t *out_size)
{
    size_t        bytes, chars;
    uint32_t      cp;
    const u_char  *stop;

    bytes = 0;
    stop = p + len;

    for (chars = 0; p < stop; chars++) {
        cp = njs_utf8_decode(ctx, &p, stop);

        if (cp <= NJS_UNICODE_MAX_CODEPOINT) {
            bytes += njs_utf8_size(cp);
            continue;
        }

        /* The chunk ended inside a sequence, nothing to count yet. */

        if (cp == NJS_UNICODE_CONTINUE) {
            break;
        }

        if (fatal) {
            return -1;
        }

        bytes += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
    }

    /* The final chunk left a sequence unfinished. */

    if (last && ctx->need != 0x00) {
        if (fatal) {
            return -1;
        }

        bytes += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
        chars++;
    }

    if (out_size != NULL) {
        *out_size = bytes;
    }

    return chars;
}
/*
 * Returns 1 if the "len" bytes at "p" decode without njs_utf8_decode()
 * reporting a value above NJS_UNICODE_MAX_CODEPOINT, and 0 otherwise.
 * A sequence cut off by the end of the buffer makes the buffer invalid.
 * An empty buffer is valid.
 */
njs_bool_t
njs_utf8_is_valid(const u_char *p, size_t len)
{
    uint32_t              cp;
    const u_char          *end;
    njs_unicode_decode_t  ctx;

    njs_utf8_decode_init(&ctx);

    for (end = p + len; p < end; /* void */) {
        cp = njs_utf8_decode(&ctx, &p, end);

        if (njs_slow_path(cp > NJS_UNICODE_MAX_CODEPOINT)) {
            return 0;
        }
    }

    return 1;
}