Index: test/csdetect.c =================================================================== --- test/csdetect.c (revision 4665) +++ test/csdetect.c (working copy) @@ -4,9 +4,10 @@ #include #include +#include + #include -#include "charset/aliases.h" #include "charset/detect.h" #include "utils/utils.h" @@ -113,20 +114,21 @@ void run_test(const uint8_t *data, size_t len, char *expected) { - uint16_t mibenum; - hubbub_charset_source source; + uint16_t mibenum = 0; + hubbub_charset_source source = HUBBUB_CHARSET_UNKNOWN; static int testnum; - assert(hubbub_charset_extract(&data, &len, + assert(hubbub_charset_extract(data, len, &mibenum, &source) == HUBBUB_OK); assert(mibenum != 0); printf("%d: Detected charset %s (%d) Source %d Expected %s (%d)\n", - ++testnum, hubbub_mibenum_to_name(mibenum), + ++testnum, parserutils_charset_mibenum_to_name(mibenum), mibenum, source, expected, - hubbub_mibenum_from_name(expected, strlen(expected))); + parserutils_charset_mibenum_from_name( + expected, strlen(expected))); - assert(mibenum == - hubbub_mibenum_from_name(expected, strlen(expected))); + assert(mibenum == parserutils_charset_mibenum_from_name( + expected, strlen(expected))); } Index: test/tokeniser2.c =================================================================== --- test/tokeniser2.c (revision 4665) +++ test/tokeniser2.c (working copy) @@ -74,6 +74,7 @@ ctx.last_start_tag = NULL; ctx.content_model = NULL; + ctx.process_cdata = false; /* Extract settings */ for (entry = json_object_get_object(test)->head; entry; Index: src/utils/utf16.c =================================================================== --- src/utils/utf16.c (revision 4665) +++ src/utils/utf16.c (working copy) @@ -1,239 +0,0 @@ -/* - * This file is part of Hubbub. - * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell - */ - -/** \file - * UTF-16 manipulation functions (implementation). - */ - -#include -#include -#include - -#include "utils/utf16.h" - -/** - * Convert a UTF-16 sequence into a single UCS4 character - * - * \param s The sequence to process - * \param len Length of sequence - * \param ucs4 Pointer to location to receive UCS4 character (host endian) - * \param clen Pointer to location to receive byte length of UTF-16 sequence - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf16_to_ucs4(const uint8_t *s, size_t len, - uint32_t *ucs4, size_t *clen) -{ - const uint16_t *ss = (const uint16_t *) (const void *) s; - - if (s == NULL || ucs4 == NULL || clen == NULL) - return HUBBUB_BADPARM; - - if (len < 2) - return HUBBUB_NEEDDATA; - - if (*ss < 0xD800 || *ss > 0xDFFF) { - *ucs4 = *ss; - *clen = 2; - } else if (0xD800 <= *ss && *ss <= 0xBFFF) { - if (len < 4) - return HUBBUB_NEEDDATA; - - if (0xDC00 <= ss[1] && ss[1] <= 0xE000) { - *ucs4 = (((s[0] >> 6) & 0x1f) + 1) | - ((s[0] & 0x3f) | (s[1] & 0x3ff)); - *clen = 4; - } else { - return HUBBUB_INVALID; - } - } - - return HUBBUB_OK; -} - -/** - * Convert a single UCS4 character into a UTF-16 sequence - * - * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian) - * \param s Pointer to 4 byte long output buffer - * \param len Pointer to location to receive length of multibyte sequence - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf16_from_ucs4(uint32_t ucs4, uint8_t *s, - size_t *len) -{ - uint16_t *ss = (uint16_t *) (void *) s; - uint32_t l = 0; - - if (s == NULL || len == NULL) - return HUBBUB_BADPARM; - else if (ucs4 < 0x10000) { - *ss = (uint16_t) ucs4; - l = 2; - } else if (ucs4 < 0x110000) { - ss[0] = 0xD800 | (((ucs4 >> 16) & 0x1f) - 1) | (ucs4 >> 10); - ss[1] = 0xDC00 | (ucs4 & 0x3ff); - l = 4; - } else { - return HUBBUB_INVALID; - } - - *len = l; - - return HUBBUB_OK; -} - -/** - * Calculate the length (in characters) of a bounded UTF-16 string - * - * \param s The string - * \param max Maximum length - * \param len Pointer to location to receive length of string - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf16_length(const uint8_t *s, size_t max, - size_t *len) -{ - const uint16_t *ss = (const uint16_t *) (const void *) s; - const uint16_t *end = (const uint16_t *) (const void *) (s + max); - int l = 0; - - if (s == NULL || len == NULL) - return HUBBUB_BADPARM; - - while (ss < end) { - if (*ss < 0xD800 || 0xDFFF < *ss) - ss++; - else - ss += 2; - - l++; - } - - *len = l; - - return HUBBUB_OK; -} - -/** - * Calculate the length (in bytes) of a UTF-16 character - * - * \param s Pointer to start of character - * \param len Pointer to location to receive length - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf16_char_byte_length(const uint8_t *s, - size_t *len) -{ - const uint16_t *ss = (const uint16_t *) (const void *) s; - - if (s == NULL || len == NULL) - return HUBBUB_BADPARM; - - if (*ss < 0xD800 || 0xDFFF < *ss) - *len = 2; - else - *len = 4; - - return HUBBUB_OK; -} - -/** - * Find previous legal UTF-16 char in string - * - * \param s The string - * \param off Offset in the string to start at - * \param prevoff Pointer to location to receive offset of first byte of - * previous legal character - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf16_prev(const uint8_t *s, uint32_t off, - uint32_t *prevoff) -{ - const uint16_t *ss = (const uint16_t *) (const void *) s; - - if (s == NULL || prevoff == NULL) - return HUBBUB_BADPARM; - - if (off < 2) - *prevoff = 0; - else if (ss[-1] < 0xDC00 || ss[-1] > 0xDFFF) - *prevoff = off - 2; - else - *prevoff = (off < 4) ? 0 : off - 4; - - return HUBBUB_OK; -} - -/** - * Find next legal UTF-16 char in string - * - * \param s The string (assumed valid) - * \param len Maximum offset in string - * \param off Offset in the string to start at - * \param nextoff Pointer to location to receive offset of first byte of - * next legal character - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf16_next(const uint8_t *s, uint32_t len, - uint32_t off, uint32_t *nextoff) -{ - const uint16_t *ss = (const uint16_t *) (const void *) s; - - if (s == NULL || off >= len || nextoff == NULL) - return HUBBUB_BADPARM; - - if (len - off < 4) - *nextoff = len; - else if (ss[1] < 0xD800 || ss[1] > 0xDBFF) - *nextoff = off + 2; - else - *nextoff = (len - off < 6) ? len : off + 4; - - return HUBBUB_OK; -} - -/** - * Find next legal UTF-16 char in string - * - * \param s The string (assumed to be of dubious validity) - * \param len Maximum offset in string - * \param off Offset in the string to start at - * \param nextoff Pointer to location to receive offset of first byte of - * next legal character - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf16_next_paranoid(const uint8_t *s, - uint32_t len, uint32_t off, uint32_t *nextoff) -{ - const uint16_t *ss = (const uint16_t *) (const void *) s; - - if (s == NULL || off >= len || nextoff == NULL) - return HUBBUB_BADPARM; - - while (1) { - if (len - off < 4) { - return HUBBUB_NEEDDATA; - } else if (ss[1] < 0xD800 || ss[1] > 0xDFFF) { - *nextoff = off + 2; - break; - } else if (ss[1] >= 0xD800 && ss[1] <= 0xDBFF) { - if (len - off < 6) - return HUBBUB_NEEDDATA; - - if (ss[2] >= 0xDC00 && ss[2] <= 0xDFFF) { - *nextoff = off + 4; - break; - } else { - ss++; - off += 2; - } - } - } - - return HUBBUB_OK; -} - Index: src/utils/utf8.c =================================================================== --- src/utils/utf8.c (revision 4665) +++ src/utils/utf8.c (working copy) @@ -1,368 +0,0 @@ -/* - * This file is part of Hubbub. - * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell - */ - -/** \file - * UTF-8 manipulation functions (implementation). - */ - -#include -#include -#include - -#include "utils/utf8.h" - -/** Number of continuation bytes for a given start byte */ -static const uint8_t numContinuations[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, -}; - -/** - * Convert a UTF-8 multibyte sequence into a single UCS4 character - * - * Encoding of UCS values outside the UTF-16 plane has been removed from - * RFC3629. This function conforms to RFC2279, however. - * - * \param s The sequence to process - * \param len Length of sequence - * \param ucs4 Pointer to location to receive UCS4 character (host endian) - * \param clen Pointer to location to receive byte length of UTF-8 sequence - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf8_to_ucs4(const uint8_t *s, size_t len, - uint32_t *ucs4, size_t *clen) -{ - if (s == NULL || ucs4 == NULL || clen == NULL) - return HUBBUB_BADPARM; - - if (len == 0) - return HUBBUB_NEEDDATA; - - if (*s < 0x80) { - *ucs4 = *s; - *clen = 1; - } else if ((*s & 0xE0) == 0xC0) { - if (len < 2) - return HUBBUB_NEEDDATA; - else if ((*(s+1) & 0xC0) != 0x80) - return HUBBUB_INVALID; - else { - *ucs4 = ((*s & 0x1F) << 6) | (*(s+1) & 0x3F); - *clen = 2; - } - } else if ((*s & 0xF0) == 0xE0) { - if (len < 3) - return HUBBUB_NEEDDATA; - else if ((*(s+1) & 0xC0) != 0x80 || - (*(s+2) & 0xC0) != 0x80) - return HUBBUB_INVALID; - else { - *ucs4 = ((*s & 0x0F) << 12) | - ((*(s+1) & 0x3F) << 6) | - (*(s+2) & 0x3F); - *clen = 3; - } - } else if ((*s & 0xF8) == 0xF0) { - if (len < 4) - return HUBBUB_NEEDDATA; - else if ((*(s+1) & 0xC0) != 0x80 || - (*(s+2) & 0xC0) != 0x80 || - (*(s+3) & 0xC0) != 0x80) - return HUBBUB_INVALID; - else { - *ucs4 = ((*s & 0x0F) << 18) | - ((*(s+1) & 0x3F) << 12) | - ((*(s+2) & 0x3F) << 6) | - (*(s+3) & 0x3F); - *clen = 4; - } - } else if ((*s & 0xFC) == 0xF8) { - if (len < 5) - return HUBBUB_NEEDDATA; - else if ((*(s+1) & 0xC0) != 0x80 || - (*(s+2) & 0xC0) != 0x80 || - (*(s+3) & 0xC0) != 0x80 || - (*(s+4) & 0xC0) != 0x80) - return HUBBUB_INVALID; - else { - *ucs4 = ((*s & 0x0F) << 24) | - ((*(s+1) & 0x3F) << 18) | - ((*(s+2) & 0x3F) << 12) | - ((*(s+3) & 0x3F) << 6) | - (*(s+4) & 0x3F); - *clen = 5; - } - } else if ((*s & 0xFE) == 0xFC) { - if (len < 6) - return HUBBUB_NEEDDATA; - else if ((*(s+1) & 0xC0) != 0x80 || - (*(s+2) & 0xC0) != 0x80 || - (*(s+3) & 0xC0) != 0x80 || - (*(s+4) & 0xC0) != 0x80 || - (*(s+5) & 0xC0) != 0x80) - return HUBBUB_INVALID; - else { - *ucs4 = ((*s & 0x0F) << 28) | - ((*(s+1) & 0x3F) << 24) | - ((*(s+2) & 0x3F) << 18) | - ((*(s+3) & 0x3F) << 12) | - ((*(s+4) & 0x3F) << 6) | - (*(s+5) & 0x3F); - *clen = 6; - } - } else { - return HUBBUB_INVALID; - } - - return HUBBUB_OK; -} - -/** - * Convert a single UCS4 character into a UTF-8 multibyte sequence - * - * Encoding of UCS values outside the UTF-16 plane has been removed from - * RFC3629. This function conforms to RFC2279, however. - * - * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian) - * \param s Pointer to 6 byte long output buffer - * \param len Pointer to location to receive length of multibyte sequence - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf8_from_ucs4(uint32_t ucs4, uint8_t *s, - size_t *len) -{ - uint32_t l = 0; - - if (s == NULL || len == NULL) - return HUBBUB_BADPARM; - else if (ucs4 < 0x80) { - *s = (uint8_t) ucs4; - l = 1; - } else if (ucs4 < 0x800) { - *s = 0xC0 | ((ucs4 >> 6) & 0x1F); - *(s+1) = 0x80 | (ucs4 & 0x3F); - l = 2; - } else if (ucs4 < 0x10000) { - *s = 0xE0 | ((ucs4 >> 12) & 0xF); - *(s+1) = 0x80 | ((ucs4 >> 6) & 0x3F); - *(s+2) = 0x80 | (ucs4 & 0x3F); - l = 3; - } else if (ucs4 < 0x200000) { - *s = 0xF0 | ((ucs4 >> 18) & 0x7); - *(s+1) = 0x80 | ((ucs4 >> 12) & 0x3F); - *(s+2) = 0x80 | ((ucs4 >> 6) & 0x3F); - *(s+3) = 0x80 | (ucs4 & 0x3F); - l = 4; - } else if (ucs4 < 0x4000000) { - *s = 0xF8 | ((ucs4 >> 24) & 0x3); - *(s+1) = 0x80 | ((ucs4 >> 18) & 0x3F); - *(s+2) = 0x80 | ((ucs4 >> 12) & 0x3F); - *(s+3) = 0x80 | ((ucs4 >> 6) & 0x3F); - *(s+4) = 0x80 | (ucs4 & 0x3F); - l = 5; - } else if (ucs4 <= 0x7FFFFFFF) { - *s = 0xFC | ((ucs4 >> 30) & 0x1); - *(s+1) = 0x80 | ((ucs4 >> 24) & 0x3F); - *(s+2) = 0x80 | ((ucs4 >> 18) & 0x3F); - *(s+3) = 0x80 | ((ucs4 >> 12) & 0x3F); - *(s+4) = 0x80 | ((ucs4 >> 6) & 0x3F); - *(s+5) = 0x80 | (ucs4 & 0x3F); - l = 6; - } else { - return HUBBUB_INVALID; - } - - *len = l; - - return HUBBUB_OK; -} - -/** - * Calculate the length (in characters) of a bounded UTF-8 string - * - * \param s The string - * \param max Maximum length - * \param len Pointer to location to receive length of string - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf8_length(const uint8_t *s, size_t max, - size_t *len) -{ - const uint8_t *end = s + max; - int l = 0; - - if (s == NULL || len == NULL) - return HUBBUB_BADPARM; - - while (s < end) { - if ((*s & 0x80) == 0x00) - s += 1; - else if ((*s & 0xE0) == 0xC0) - s += 2; - else if ((*s & 0xF0) == 0xE0) - s += 3; - else if ((*s & 0xF8) == 0xF0) - s += 4; - else if ((*s & 0xFC) == 0xF8) - s += 5; - else if ((*s & 0xFE) == 0xFC) - s += 6; - else - return HUBBUB_INVALID; - l++; - } - - *len = l; - - return HUBBUB_OK; -} - -/** - * Calculate the length (in bytes) of a UTF-8 character - * - * \param s Pointer to start of character - * \param len Pointer to location to receive length - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf8_char_byte_length(const uint8_t *s, - size_t *len) -{ - if (s == NULL || len == NULL) - return HUBBUB_BADPARM; - - *len = numContinuations[s[0]] + 1 /* Start byte */; - - return HUBBUB_OK; -} - -/** - * Find previous legal UTF-8 char in string - * - * \param s The string - * \param off Offset in the string to start at - * \param prevoff Pointer to location to receive offset of first byte of - * previous legal character - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf8_prev(const uint8_t *s, uint32_t off, - uint32_t *prevoff) -{ - if (s == NULL || prevoff == NULL) - return HUBBUB_BADPARM; - - while (off != 0 && (s[--off] & 0xC0) == 0x80) - /* do nothing */; - - *prevoff = off; - - return HUBBUB_OK; -} - -/** - * Find next legal UTF-8 char in string - * - * \param s The string (assumed valid) - * \param len Maximum offset in string - * \param off Offset in the string to start at - * \param nextoff Pointer to location to receive offset of first byte of - * next legal character - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf8_next(const uint8_t *s, uint32_t len, - uint32_t off, uint32_t *nextoff) -{ - if (s == NULL || off >= len || nextoff == NULL) - return HUBBUB_BADPARM; - - /* Skip current start byte (if present - may be mid-sequence) */ - if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0) - off++; - - while (off < len && (s[off] & 0xC0) == 0x80) - off++; - - *nextoff = off; - - return HUBBUB_OK; -} - -/** - * Find next legal UTF-8 char in string - * - * \param s The string (assumed to be of dubious validity) - * \param len Maximum offset in string - * \param off Offset in the string to start at - * \param nextoff Pointer to location to receive offset of first byte of - * next legal character - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf8_next_paranoid(const uint8_t *s, uint32_t len, - uint32_t off, uint32_t *nextoff) -{ - bool valid; - - if (s == NULL || off >= len || nextoff == NULL) - return HUBBUB_BADPARM; - - /* Skip current start byte (if present - may be mid-sequence) */ - if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0) - off++; - - while (1) { - /* Find next possible start byte */ - while (off < len && (s[off] & 0xC0) == 0x80) - off++; - - /* Ran off end of data */ - if (off == len || off + numContinuations[s[off]] >= len) - return HUBBUB_NEEDDATA; - - /* Found if start byte is ascii, - * or next n bytes are valid continuations */ - valid = true; - - switch (numContinuations[s[off]]) { - case 5: - valid &= ((s[off + 5] & 0xC0) == 0x80); - case 4: - valid &= ((s[off + 4] & 0xC0) == 0x80); - case 3: - valid &= ((s[off + 3] & 0xC0) == 0x80); - case 2: - valid &= ((s[off + 2] & 0xC0) == 0x80); - case 1: - valid &= ((s[off + 1] & 0xC0) == 0x80); - case 0: - valid &= (s[off + 0] < 0x80); - } - - if (valid) - break; - - /* Otherwise, skip this (invalid) start byte and try again */ - off++; - } - - *nextoff = off; - - return HUBBUB_OK; -} - Index: src/utils/utf16.h =================================================================== --- src/utils/utf16.h (revision 4665) +++ src/utils/utf16.h (working copy) @@ -1,38 +0,0 @@ -/* - * This file is part of Hubbub. - * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell - */ - -/** \file - * UTF-16 manipulation functions (interface). - */ - -#ifndef hubbub_utils_utf16_h_ -#define hubbub_utils_utf16_h_ - -#include - -#include - -hubbub_error hubbub_utf16_to_ucs4(const uint8_t *s, size_t len, - uint32_t *ucs4, size_t *clen); -hubbub_error hubbub_utf16_from_ucs4(uint32_t ucs4, uint8_t *s, - size_t *len); - -hubbub_error hubbub_utf16_length(const uint8_t *s, size_t max, - size_t *len); -hubbub_error hubbub_utf16_char_byte_length(const uint8_t *s, - size_t *len); - -hubbub_error hubbub_utf16_prev(const uint8_t *s, uint32_t off, - uint32_t *prevoff); -hubbub_error hubbub_utf16_next(const uint8_t *s, uint32_t len, - uint32_t off, uint32_t *nextoff); - -hubbub_error hubbub_utf16_next_paranoid(const uint8_t *s, - uint32_t len, uint32_t off, uint32_t *nextoff); - -#endif - Index: src/utils/utf8.h =================================================================== --- src/utils/utf8.h (revision 4665) +++ src/utils/utf8.h (working copy) @@ -1,38 +0,0 @@ -/* - * This file is part of Hubbub. - * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell - */ - -/** \file - * UTF-8 manipulation functions (interface). - */ - -#ifndef hubbub_utils_utf8_h_ -#define hubbub_utils_utf8_h_ - -#include - -#include - -hubbub_error hubbub_utf8_to_ucs4(const uint8_t *s, size_t len, - uint32_t *ucs4, size_t *clen); -hubbub_error hubbub_utf8_from_ucs4(uint32_t ucs4, uint8_t *s, - size_t *len); - -hubbub_error hubbub_utf8_length(const uint8_t *s, size_t max, - size_t *len); -hubbub_error hubbub_utf8_char_byte_length(const uint8_t *s, - size_t *len); - -hubbub_error hubbub_utf8_prev(const uint8_t *s, uint32_t off, - uint32_t *prevoff); -hubbub_error hubbub_utf8_next(const uint8_t *s, uint32_t len, - uint32_t off, uint32_t *nextoff); - -hubbub_error hubbub_utf8_next_paranoid(const uint8_t *s, uint32_t len, - uint32_t off, uint32_t *nextoff); - -#endif - Index: src/utils/Makefile =================================================================== --- src/utils/Makefile (revision 4665) +++ src/utils/Makefile (working copy) @@ -32,7 +32,7 @@ d := $(DIR) # Sources -SRCS_$(d) := dict.c errors.c utf8.c utf16.c string.c +SRCS_$(d) := dict.c errors.c string.c # Append to sources for component SOURCES += $(addprefix $(d), $(SRCS_$(d))) Index: src/tokeniser/tokeniser.c =================================================================== --- src/tokeniser/tokeniser.c (revision 4665) +++ src/tokeniser/tokeniser.c (working copy) @@ -10,8 +10,9 @@ #include +#include + #include "utils/utils.h" -#include "utils/utf8.h" #include "tokeniser/entities.h" #include "tokeniser/tokeniser.h" @@ -682,18 +683,18 @@ hubbub_token token; uint8_t utf8[6]; - size_t len; + uint8_t *utf8ptr = utf8; + size_t len = sizeof(utf8); token.type = HUBBUB_TOKEN_CHARACTER; if (tokeniser->context.match_entity.codepoint) { - hubbub_utf8_from_ucs4( - tokeniser->context.match_entity.codepoint, - utf8, - &len); + parserutils_charset_utf8_from_ucs4( + tokeniser->context.match_entity.codepoint, + &utf8ptr, &len); token.data.character.ptr = utf8; - token.data.character.len = len; + token.data.character.len = sizeof(utf8) - len; hubbub_tokeniser_emit_token(tokeniser, &token); @@ -1433,13 +1434,13 @@ hubbub_tag *ctag = &tokeniser->context.current_tag; uint8_t utf8[6]; - size_t len; + uint8_t *utf8ptr = utf8; + size_t len = sizeof(utf8); if (tokeniser->context.match_entity.codepoint) { - hubbub_utf8_from_ucs4( - tokeniser->context.match_entity.codepoint, - utf8, - &len); + parserutils_charset_utf8_from_ucs4( + tokeniser->context.match_entity.codepoint, + &utf8ptr, &len); /* +1 for the ampersand */ COLLECT(tokeniser->context.chars, 0, @@ -1448,7 +1449,7 @@ hubbub_buffer_add(tokeniser, &ctag->attributes[ ctag->n_attributes - 1].value, - (uintptr_t) utf8, len); + (uintptr_t) utf8, sizeof(utf8) - len); } else { size_t len; uintptr_t cptr = parserutils_inputstream_peek( @@ -2754,12 +2755,11 @@ /* Look at the character after the ampersand */ cptr = parserutils_inputstream_peek(tokeniser->input, off, &len); - - uint8_t c = CHAR(cptr); - if (cptr == PARSERUTILS_INPUTSTREAM_OOD) return false; + uint8_t c = CHAR(cptr); + /* Set things up */ tokeniser->context.match_entity.offset = off; tokeniser->context.match_entity.poss_length = 0; Index: src/hubbub.c =================================================================== --- src/hubbub.c (revision 4665) +++ src/hubbub.c (working copy) @@ -9,7 +9,6 @@ #include -#include "charset/aliases.h" #include "tokeniser/entities.h" /** Index: src/charset/aliases.c =================================================================== --- src/charset/aliases.c (revision 4665) +++ src/charset/aliases.c (working copy) @@ -1,361 +0,0 @@ -/* - * This file is part of Hubbub. - * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell - */ - -#include -#include -#include -#include -#include -#include - -#include "charset/aliases.h" - -struct alias { - struct alias *next; - hubbub_aliases_canon *canon; - uint16_t name_len; - char name[1]; -}; - -#define HASH_SIZE (43) -static hubbub_aliases_canon *canon_tab[HASH_SIZE]; -static struct alias *alias_tab[HASH_SIZE]; - -static hubbub_error hubbub_create_alias(const char *alias, - hubbub_aliases_canon *c, hubbub_alloc alloc, void *pw); -static hubbub_aliases_canon *hubbub_create_canon(const char *canon, - uint16_t mibenum, hubbub_alloc alloc, void *pw); -static uint32_t hubbub_hash_val(const char *alias, size_t len); - -/** - * Create alias data from Aliases file - * - * \param filename The path to the Aliases file - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data (may be NULL) - * \return HUBBUB_OK on success, appropriate error otherwise. - */ -hubbub_error hubbub_aliases_create(const char *filename, - hubbub_alloc alloc, void *pw) -{ - char buf[300]; - FILE *fp; - - if (filename == NULL || alloc == NULL) - return HUBBUB_BADPARM; - - fp = fopen(filename, "r"); - if (fp == NULL) - return HUBBUB_FILENOTFOUND; - - while (fgets(buf, sizeof buf, fp)) { - char *p, *aliases = 0, *mib, *end; - hubbub_aliases_canon *cf; - - if (buf[0] == 0 || buf[0] == '#') - /* skip blank lines or comments */ - continue; - - buf[strlen(buf) - 1] = 0; /* lose terminating newline */ - end = buf + strlen(buf); - - /* find end of canonical form */ - for (p = buf; *p && !isspace(*p) && !iscntrl(*p); p++) - ; /* do nothing */ - if (p >= end) - continue; - *p++ = '\0'; /* terminate canonical form */ - - /* skip whitespace */ - for (; *p && isspace(*p); p++) - ; /* do nothing */ - if (p >= end) - continue; - mib = p; - - /* find end of mibenum */ - for (; *p && !isspace(*p) && !iscntrl(*p); p++) - ; /* do nothing */ - if (p < end) - *p++ = '\0'; /* terminate mibenum */ - - cf = hubbub_create_canon(buf, atoi(mib), alloc, pw); - if (cf == NULL) - continue; - - /* skip whitespace */ - for (; p < end && *p && isspace(*p); p++) - ; /* do nothing */ - if (p >= end) - continue; - aliases = p; - - while (p < end) { - /* find end of alias */ - for (; *p && !isspace(*p) && !iscntrl(*p); p++) - ; /* do nothing */ - if (p > end) - /* stop if we've gone past the end */ - break; - /* terminate current alias */ - *p++ = '\0'; - - if (hubbub_create_alias(aliases, cf, - alloc, pw) != HUBBUB_OK) - break; - - /* in terminating, we may have advanced - * past the end - check this here */ - if (p >= end) - break; - - /* skip whitespace */ - for (; *p && isspace(*p); p++) - ; /* do nothing */ - - if (p >= end) - /* gone past end => stop */ - break; - - /* update pointer to current alias */ - aliases = p; - } - } - - fclose(fp); - - return HUBBUB_OK; -} - -/** - * Free all alias data - * - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data - */ -void hubbub_aliases_destroy(hubbub_alloc alloc, void *pw) -{ - hubbub_aliases_canon *c, *d; - struct alias *a, *b; - int i; - - for (i = 0; i != HASH_SIZE; i++) { - for (c = canon_tab[i]; c; c = d) { - d = c->next; - alloc(c, 0, pw); - } - canon_tab[i] = NULL; - - for (a = alias_tab[i]; a; a = b) { - b = a->next; - alloc(a, 0, pw); - } - alias_tab[i] = NULL; - } -} - -/** - * Retrieve the MIB enum value assigned to an encoding name - * - * \param alias The alias to lookup - * \param len The length of the alias string - * \return The MIB enum value, or 0 if not found - */ -uint16_t hubbub_mibenum_from_name(const char *alias, size_t len) -{ - hubbub_aliases_canon *c; - - if (alias == NULL) - return 0; - - c = hubbub_alias_canonicalise(alias, len); - if (c == NULL) - return 0; - - return c->mib_enum; -} - -/** - * Retrieve the canonical name of an encoding from the MIB enum - * - * \param mibenum The MIB enum value - * \return Pointer to canonical name, or NULL if not found - */ -const char *hubbub_mibenum_to_name(uint16_t mibenum) -{ - int i; - hubbub_aliases_canon *c; - - for (i = 0; i != HASH_SIZE; i++) - for (c = canon_tab[i]; c; c = c->next) - if (c->mib_enum == mibenum) - return c->name; - - return NULL; -} - - -/** - * Retrieve the canonical form of an alias name - * - * \param alias The alias name - * \param len The length of the alias name - * \return Pointer to canonical form or NULL if not found - */ -hubbub_aliases_canon *hubbub_alias_canonicalise(const char *alias, - size_t len) -{ - uint32_t hash; - hubbub_aliases_canon *c; - struct alias *a; - - if (alias == NULL) - return NULL; - - hash = hubbub_hash_val(alias, len); - - for (c = canon_tab[hash]; c; c = c->next) - if (c->name_len == len && - strncasecmp(c->name, alias, len) == 0) - break; - if (c) - return c; - - for (a = alias_tab[hash]; a; a = a->next) - if (a->name_len == len && - strncasecmp(a->name, alias, len) == 0) - break; - if (a) - return a->canon; - - return NULL; -} - - -/** - * Create an alias - * - * \param alias The alias name - * \param c The canonical form - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data (may be NULL) - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_create_alias(const char *alias, hubbub_aliases_canon *c, - hubbub_alloc alloc, void *pw) -{ - struct alias *a; - uint32_t hash; - - if (alias == NULL || c == NULL || alloc == NULL) - return HUBBUB_BADPARM; - - a = alloc(NULL, sizeof(struct alias) + strlen(alias) + 1, pw); - if (a == NULL) - return HUBBUB_NOMEM; - - a->canon = c; - a->name_len = strlen(alias); - strcpy(a->name, alias); - a->name[a->name_len] = '\0'; - - hash = hubbub_hash_val(alias, a->name_len); - - a->next = alias_tab[hash]; - alias_tab[hash] = a; - - return HUBBUB_OK; -} - -/** - * Create a canonical form - * - * \param canon The canonical name - * \param mibenum The MIB enum value - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data (may be NULL) - * \return Pointer to canonical form or NULL on error - */ -hubbub_aliases_canon *hubbub_create_canon(const char *canon, - uint16_t mibenum, hubbub_alloc alloc, void *pw) -{ - hubbub_aliases_canon *c; - uint32_t hash, len; - - if (canon == NULL || alloc == NULL) - return NULL; - - len = strlen(canon); - - c = alloc(NULL, sizeof(hubbub_aliases_canon) + len + 1, pw); - if (c == NULL) - return NULL; - - c->mib_enum = mibenum; - c->name_len = len; - strcpy(c->name, canon); - c->name[len] = '\0'; - - hash = hubbub_hash_val(canon, len); - - c->next = canon_tab[hash]; - canon_tab[hash] = c; - - return c; -} - -/** - * Hash function - * - * \param alias String to hash - * \return The hashed value - */ -uint32_t hubbub_hash_val(const char *alias, size_t len) -{ - const char *s = alias; - uint32_t h = 5381; - - if (alias == NULL) - return 0; - - while (len--) - h = (h * 33) ^ (*s++ & ~0x20); /* case insensitive */ - - return h % HASH_SIZE; -} - - -#ifndef NDEBUG -/** - * Dump all alias data to stdout - */ -void hubbub_aliases_dump(void) -{ - hubbub_aliases_canon *c; - struct alias *a; - int i; - size_t size = 0; - - for (i = 0; i != HASH_SIZE; i++) { - for (c = canon_tab[i]; c; c = c->next) { - printf("%d %s\n", i, c->name); - size += offsetof(hubbub_aliases_canon, name) + - c->name_len; - } - - for (a = alias_tab[i]; a; a = a->next) { - printf("%d %s\n", i, a->name); - size += offsetof(struct alias, name) + a->name_len; - } - } - - size += (sizeof(canon_tab) / sizeof(canon_tab[0])); - size += (sizeof(alias_tab) / sizeof(alias_tab[0])); - - printf("%u\n", (unsigned int) size); -} -#endif Index: src/charset/codec.h =================================================================== --- src/charset/codec.h (revision 4665) +++ src/charset/codec.h (working copy) @@ -1,153 +0,0 @@ -/* - * This file is part of Hubbub. - * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell - */ - -#ifndef hubbub_charset_codec_h_ -#define hubbub_charset_codec_h_ - -#include - -#include -#include - -typedef struct hubbub_charsetcodec hubbub_charsetcodec; - -#define HUBBUB_CHARSETCODEC_NULL (0xffffffffU) - -/** - * Type of charset codec filter function - * - * \param c UCS4 character (in host byte order) or - * HUBBUB_CHARSETCODEC_NULL to reset - * \param output Pointer to location to store output buffer location - * \param outputlen Pointer to location to store output buffer length - * \param pw Pointer to client-specific private data - * \return HUBBUB_OK on success, or appropriate error otherwise. - * - * The output buffer is owned by the filter code and will not be freed by - * any charset codec. It should contain the replacement UCS4 character(s) - * for the input. The replacement characters should be in host byte order. - * The contents of *output and *outputlen on entry are ignored and these - * will be filled in by the filter code. - * - * Filters may elect to replace the input character with no output. In this - * case, *output should be set to NULL and *outputlen should be set to 0 and - * HUBBUB_OK should be returned. - * - * The output length is in terms of the number of UCS4 characters in the - * output buffer. i.e.: - * - * for (size_t i = 0; i < outputlen; i++) { - * dest[curchar++] = output[i]; - * } - * - * would copy the contents of the filter output buffer to the codec's output - * buffer. - */ -typedef hubbub_error (*hubbub_charsetcodec_filter)(uint32_t c, - uint32_t **output, size_t *outputlen, void *pw); - -/** - * Charset codec error mode - * - * A codec's error mode determines its behaviour in the face of: - * - * + characters which are unrepresentable in the destination charset (if - * encoding data) or which cannot be converted to UCS4 (if decoding data). - * + invalid byte sequences (both encoding and decoding) - * - * The options provide a choice between the following approaches: - * - * + draconian, "stop processing" ("strict") - * + "replace the unrepresentable character with something else" ("loose") - * + "attempt to transliterate, or replace if unable" ("translit") - * - * The default error mode is "loose". - * - * - * In the "loose" case, the replacement character will depend upon: - * - * + Whether the operation was encoding or decoding - * + If encoding, what the destination charset is. - * - * If decoding, the replacement character will be: - * - * U+FFFD (REPLACEMENT CHARACTER) - * - * If encoding, the replacement character will be: - * - * U+003F (QUESTION MARK) if the destination charset is not UTF-(8|16|32) - * U+FFFD (REPLACEMENT CHARACTER) otherwise. - * - * - * In the "translit" case, the codec will attempt to transliterate into - * the destination charset, if encoding. If decoding, or if transliteration - * fails, this option is identical to "loose". - */ -typedef enum hubbub_charsetcodec_errormode { - /** Abort processing if unrepresentable character encountered */ - HUBBUB_CHARSETCODEC_ERROR_STRICT = 0, - /** Replace unrepresentable characters with single alternate */ - HUBBUB_CHARSETCODEC_ERROR_LOOSE = 1, - /** Transliterate unrepresentable characters, if possible */ - HUBBUB_CHARSETCODEC_ERROR_TRANSLIT = 2, -} hubbub_charsetcodec_errormode; - -/** - * Charset codec option types - */ -typedef enum hubbub_charsetcodec_opttype { - /** Register codec filter function */ - HUBBUB_CHARSETCODEC_FILTER_FUNC = 0, - /** Set codec error mode */ - HUBBUB_CHARSETCODEC_ERROR_MODE = 1, -} hubbub_charsetcodec_opttype; - -/** - * Charset codec option parameters - */ -typedef union hubbub_charsetcodec_optparams { - /** Parameters for filter function setting */ - struct { - /** Filter function */ - hubbub_charsetcodec_filter filter; - /** Client-specific private data */ - void *pw; - } filter_func; - - /** Parameters for error mode setting */ - struct { - /** The desired error handling mode */ - hubbub_charsetcodec_errormode mode; - } error_mode; -} hubbub_charsetcodec_optparams; - - -/* Create a charset codec */ -hubbub_charsetcodec *hubbub_charsetcodec_create(const char *charset, - hubbub_alloc alloc, void *pw); -/* Destroy a charset codec */ -void hubbub_charsetcodec_destroy(hubbub_charsetcodec *codec); - -/* Configure a charset codec */ -hubbub_error hubbub_charsetcodec_setopt(hubbub_charsetcodec *codec, - hubbub_charsetcodec_opttype type, - hubbub_charsetcodec_optparams *params); - -/* Encode a chunk of UCS4 data into a codec's charset */ -hubbub_error hubbub_charsetcodec_encode(hubbub_charsetcodec *codec, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen); - -/* Decode a chunk of data in a codec's charset into UCS4 */ -hubbub_error hubbub_charsetcodec_decode(hubbub_charsetcodec *codec, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen); - -/* Reset a charset codec */ -hubbub_error hubbub_charsetcodec_reset(hubbub_charsetcodec *codec); - -#endif Index: src/charset/codec_utf16.c =================================================================== --- src/charset/codec_utf16.c (revision 4665) +++ src/charset/codec_utf16.c (working copy) @@ -1,620 +0,0 @@ -/* - * This file is part of Hubbub. - * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell - */ - -#include -#include - -/* These two are for htonl / ntohl */ -#include -#include - -#include "charset/aliases.h" -#include "utils/utf16.h" -#include "utils/utils.h" - -#include "codec_impl.h" - -/** - * UTF-16 charset codec - */ -typedef struct hubbub_utf16_codec { - hubbub_charsetcodec base; /**< Base class */ - -#define INVAL_BUFSIZE (32) - uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up - * incomplete input - * sequences */ - size_t inval_len; /*< Byte length of inval_buf **/ - -#define READ_BUFSIZE (8) - uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial - * output sequences (decode) - * (host-endian) */ - size_t read_len; /**< Character length of read_buf */ - -#define WRITE_BUFSIZE (8) - uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial - * output sequences (encode) - * (host-endian) */ - size_t write_len; /**< Character length of write_buf */ - -} hubbub_utf16_codec; - -static bool hubbub_utf16_codec_handles_charset(const char *charset); -static hubbub_charsetcodec *hubbub_utf16_codec_create(const char *charset, - hubbub_alloc alloc, void *pw); -static void hubbub_utf16_codec_destroy (hubbub_charsetcodec *codec); -static hubbub_error hubbub_utf16_codec_encode(hubbub_charsetcodec *codec, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen); -static hubbub_error hubbub_utf16_codec_decode(hubbub_charsetcodec *codec, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen); -static hubbub_error hubbub_utf16_codec_reset(hubbub_charsetcodec *codec); -static hubbub_error hubbub_utf16_codec_read_char(hubbub_utf16_codec *c, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen); -static hubbub_error hubbub_utf16_codec_filter_decoded_char( - hubbub_utf16_codec *c, - uint32_t ucs4, uint8_t **dest, size_t *destlen); - -/** - * Determine whether this codec handles a specific charset - * - * \param charset Charset to test - * \return true if handleable, false otherwise - */ -bool hubbub_utf16_codec_handles_charset(const char *charset) -{ - return hubbub_mibenum_from_name(charset, strlen(charset)) == - hubbub_mibenum_from_name("UTF-16", SLEN("UTF-16")); -} - -/** - * Create a utf16 codec - * - * \param charset The charset to read from / write to - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data (may be NULL) - * \return Pointer to codec, or NULL on failure - */ -hubbub_charsetcodec *hubbub_utf16_codec_create(const char *charset, - hubbub_alloc alloc, void *pw) -{ - hubbub_utf16_codec *codec; - - UNUSED(charset); - - codec = alloc(NULL, sizeof(hubbub_utf16_codec), pw); - if (codec == NULL) - return NULL; - - codec->inval_buf[0] = '\0'; - codec->inval_len = 0; - - codec->read_buf[0] = 0; - codec->read_len = 0; - - codec->write_buf[0] = 0; - codec->write_len = 0; - - /* Finally, populate vtable */ - codec->base.handler.destroy = hubbub_utf16_codec_destroy; - codec->base.handler.encode = hubbub_utf16_codec_encode; - codec->base.handler.decode = hubbub_utf16_codec_decode; - codec->base.handler.reset = hubbub_utf16_codec_reset; - - return (hubbub_charsetcodec *) codec; -} - -/** - * Destroy a utf16 codec - * - * \param codec The codec to destroy - */ -void hubbub_utf16_codec_destroy (hubbub_charsetcodec *codec) -{ - UNUSED(codec); -} - -/** - * Encode a chunk of UCS4 data into utf16 - * - * \param codec The codec to use - * \param source Pointer to pointer to source data - * \param sourcelen Pointer to length (in bytes) of source data - * \param dest Pointer to pointer to output buffer - * \param destlen Pointer to length (in bytes) of output buffer - * \return HUBBUB_OK on success, - * HUBBUB_NOMEM if output buffer is too small, - * HUBBUB_INVALID if a character cannot be represented and the - * codec's error handling mode is set to STRICT, - * as a result of the failure of the - * client-provided filter function. - * - * On exit, ::source will point immediately _after_ the last input character - * read. Any remaining output for the character will be buffered by the - * codec for writing on the next call. This buffered data is post-filtering, - * so will not be refiltered on the next call. - * - * In the case of the filter function failing, ::source will point _at_ the - * last input character read; nothing will be written or buffered for the - * failed character. It is up to the client to fix the cause of the failure - * and retry the encoding process. - * - * Note that, if failure occurs whilst attempting to write any output - * buffered by the last call, then ::source and ::sourcelen will remain - * unchanged (as nothing more has been read). - * - * There is no way to determine the output character which caused a - * failure (as it may be one in a filter-injected replacement sequence). - * It is, however, possible to determine which source character caused it - * (this being the character immediately before the location pointed to by - * ::source on exit). - * - * [I.e. the process of filtering results in a potential one-to-many mapping - * between source characters and output characters, and identification of - * individual output characters is impossible.] - * - * ::sourcelen will be reduced appropriately on exit. - * - * ::dest will point immediately _after_ the last character written. - * - * ::destlen will be reduced appropriately on exit. - */ -hubbub_error hubbub_utf16_codec_encode(hubbub_charsetcodec *codec, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen) -{ - hubbub_utf16_codec *c = (hubbub_utf16_codec *) codec; - uint32_t ucs4; - uint32_t *towrite; - size_t towritelen; - hubbub_error error; - - /* Process any outstanding characters from the previous call */ - if (c->write_len > 0) { - uint32_t *pwrite = c->write_buf; - uint8_t buf[4]; - size_t len; - - while (c->write_len > 0) { - error = hubbub_utf16_from_ucs4(pwrite[0], buf, &len); - if (error != HUBBUB_OK) - abort(); - - if (*destlen < len) { - /* Insufficient output buffer space */ - for (len = 0; len < c->write_len; len++) - c->write_buf[len] = pwrite[len]; - - return HUBBUB_NOMEM; - } - - memcpy(*dest, buf, len); - - *dest += len; - *destlen -= len; - - pwrite++; - c->write_len--; - } - } - - /* Now process the characters for this call */ - while (*sourcelen > 0) { - ucs4 = ntohl(*((uint32_t *) (void *) *source)); - towrite = &ucs4; - towritelen = 1; - - /* Run character we're about to output through the - * registered filter, so it can replace it. */ - if (c->base.filter != NULL) { - error = c->base.filter(ucs4, - &towrite, &towritelen, - c->base.filter_pw); - if (error != HUBBUB_OK) - return error; - } - - /* Output current characters */ - while (towritelen > 0) { - uint8_t buf[4]; - size_t len; - - error = hubbub_utf16_from_ucs4(towrite[0], buf, &len); - if (error != HUBBUB_OK) - abort(); - - if (*destlen < len) { - /* Insufficient output space */ - if (towritelen >= WRITE_BUFSIZE) - abort(); - - c->write_len = towritelen; - - /* Copy pending chars to save area, for - * processing next call. */ - for (len = 0; len < towritelen; len++) - c->write_buf[len] = towrite[len]; - - /* Claim character we've just buffered, - * so it's not reprocessed */ - *source += 4; - *sourcelen -= 4; - - return HUBBUB_NOMEM; - } - - memcpy(*dest, buf, len); - - *dest += len; - *destlen -= len; - - towrite++; - towritelen--; - } - - *source += 4; - *sourcelen -= 4; - } - - return HUBBUB_OK; -} - -/** - * Decode a chunk of utf16 data into UCS4 - * - * \param codec The codec to use - * \param source Pointer to pointer to source data - * \param sourcelen Pointer to length (in bytes) of source data - * \param dest Pointer to pointer to output buffer - * \param destlen Pointer to length (in bytes) of output buffer - * \return HUBBUB_OK on success, - * HUBBUB_NOMEM if output buffer is too small, - * HUBBUB_INVALID if a character cannot be represented and the - * codec's error handling mode is set to STRICT, - * as a result of the failure of the - * client-provided filter function. - * - * On exit, ::source will point immediately _after_ the last input character - * read, if the result is _OK or _NOMEM. Any remaining output for the - * character will be buffered by the codec for writing on the next call. - * This buffered data is post-filtering, so will not be refiltered on the - * next call. - * - * In the case of the result being _INVALID or the filter function failing, - * ::source will point _at_ the last input character read; nothing will be - * written or buffered for the failed character. It is up to the client to - * fix the cause of the failure and retry the decoding process. - * - * Note that, if failure occurs whilst attempting to write any output - * buffered by the last call, then ::source and ::sourcelen will remain - * unchanged (as nothing more has been read). - * - * There is no way to determine the output character which caused a - * failure (as it may be one in a filter-injected replacement sequence). - * It is, however, possible to determine which source character caused it - * (this being the character immediately at or before the location pointed - * to by ::source on exit). - * - * [I.e. the process of filtering results in a potential one-to-many mapping - * between source characters and output characters, and identification of - * individual output characters is impossible.] - * - * If STRICT error handling is configured and an illegal sequence is split - * over two calls, then _INVALID will be returned from the second call, - * but ::source will point mid-way through the invalid sequence (i.e. it - * will be unmodified over the second call). In addition, the internal - * incomplete-sequence buffer will be emptied, such that subsequent calls - * will progress, rather than re-evaluating the same invalid sequence. - * - * ::sourcelen will be reduced appropriately on exit. - * - * ::dest will point immediately _after_ the last character written. - * - * ::destlen will be reduced appropriately on exit. - * - * Call this with a source length of 0 to flush the output buffer. - */ -hubbub_error hubbub_utf16_codec_decode(hubbub_charsetcodec *codec, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen) -{ - hubbub_utf16_codec *c = (hubbub_utf16_codec *) codec; - hubbub_error error; - - if (c->read_len > 0) { - /* Output left over from last decode */ - uint32_t *pread = c->read_buf; - - while (c->read_len > 0 && *destlen >= c->read_len * 4) { - *((uint32_t *) (void *) *dest) = htonl(pread[0]); - - *dest += 4; - *destlen -= 4; - - pread++; - c->read_len--; - } - - if (*destlen < c->read_len * 4) { - /* Ran out of output buffer */ - size_t i; - - /* Shuffle remaining output down */ - for (i = 0; i < c->read_len; i++) - c->read_buf[i] = pread[i]; - - return HUBBUB_NOMEM; - } - } - - if (c->inval_len > 0) { - /* The last decode ended in an incomplete sequence. - * Fill up inval_buf with data from the start of the - * new chunk and process it. */ - uint8_t *in = c->inval_buf; - size_t ol = c->inval_len; - size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen); - size_t orig_l = l; - - memcpy(c->inval_buf + ol, *source, l); - - l += c->inval_len; - - error = hubbub_utf16_codec_read_char(c, - (const uint8_t **) &in, &l, dest, destlen); - if (error != HUBBUB_OK && error != HUBBUB_NOMEM) { - return error; - } - - /* And now, fix up source pointers */ - *source += max((signed) (orig_l - l), 0); - *sourcelen -= max((signed) (orig_l - l), 0); - - /* Failed to resolve an incomplete character and - * ran out of buffer space. No recovery strategy - * possible, so explode everywhere. */ - if ((orig_l + ol) - l == 0) - abort(); - - /* Report memory exhaustion case from above */ - if (error != HUBBUB_OK) - return error; - } - - /* Finally, the "normal" case; process all outstanding characters */ - while (*sourcelen > 0) { - error = hubbub_utf16_codec_read_char(c, - source, sourcelen, dest, destlen); - if (error != HUBBUB_OK) { - return error; - } - } - - return HUBBUB_OK; -} - -/** - * Clear a utf16 codec's encoding state - * - * \param codec The codec to reset - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf16_codec_reset(hubbub_charsetcodec *codec) -{ - hubbub_utf16_codec *c = (hubbub_utf16_codec *) codec; - - c->inval_buf[0] = '\0'; - c->inval_len = 0; - - c->read_buf[0] = 0; - c->read_len = 0; - - c->write_buf[0] = 0; - c->write_len = 0; - - return HUBBUB_OK; -} - - -/** - * Read a character from the UTF-16 to UCS4 (big endian) - * - * \param c The codec - * \param source Pointer to pointer to source buffer (updated on exit) - * \param sourcelen Pointer to length of source buffer (updated on exit) - * \param dest Pointer to pointer to output buffer (updated on exit) - * \param destlen Pointer to length of output buffer (updated on exit) - * \return HUBBUB_OK on success, - * HUBBUB_NOMEM if output buffer is too small, - * HUBBUB_INVALID if a character cannot be represented and the - * codec's error handling mode is set to STRICT, - * as a result of the failure of the - * client-provided filter function. - * - * On exit, ::source will point immediately _after_ the last input character - * read, if the result is _OK or _NOMEM. Any remaining output for the - * character will be buffered by the codec for writing on the next call. - * This buffered data is post-filtering, so will not be refiltered on the - * next call. - * - * In the case of the result being _INVALID or the filter function failing, - * ::source will point _at_ the last input character read; nothing will be - * written or buffered for the failed character. It is up to the client to - * fix the cause of the failure and retry the decoding process. - * - * ::sourcelen will be reduced appropriately on exit. - * - * ::dest will point immediately _after_ the last character written. - * - * ::destlen will be reduced appropriately on exit. - */ -hubbub_error hubbub_utf16_codec_read_char(hubbub_utf16_codec *c, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen) -{ - uint32_t ucs4; - size_t sucs4; - hubbub_error error; - - /* Convert a single character */ - error = hubbub_utf16_to_ucs4(*source, *sourcelen, &ucs4, &sucs4); - if (error == HUBBUB_OK) { - /* Read a character */ - error = hubbub_utf16_codec_filter_decoded_char(c, - ucs4, dest, destlen); - if (error == HUBBUB_OK || error == HUBBUB_NOMEM) { - /* filter function succeeded; update source pointers */ - *source += sucs4; - *sourcelen -= sucs4; - } - - /* Clear inval buffer */ - c->inval_buf[0] = '\0'; - c->inval_len = 0; - - return error; - } else if (error == HUBBUB_NEEDDATA) { - /* Incomplete input sequence */ - if (*sourcelen > INVAL_BUFSIZE) - abort(); - - memmove(c->inval_buf, (char *) *source, *sourcelen); - c->inval_buf[*sourcelen] = '\0'; - c->inval_len = *sourcelen; - - *source += *sourcelen; - *sourcelen = 0; - - return HUBBUB_OK; - } else if (error == HUBBUB_INVALID) { - /* Illegal input sequence */ - uint32_t nextchar; - - /* Clear inval buffer */ - c->inval_buf[0] = '\0'; - c->inval_len = 0; - - /* Strict errormode; simply flag invalid character */ - if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT) { - return HUBBUB_INVALID; - } - - /* Find next valid UTF-16 sequence. - * We're processing client-provided data, so let's - * be paranoid about its validity. */ - error = hubbub_utf16_next_paranoid(*source, *sourcelen, - 0, &nextchar); - if (error != HUBBUB_OK) { - if (error == HUBBUB_NEEDDATA) { - /* Need more data to be sure */ - if (*sourcelen > INVAL_BUFSIZE) - abort(); - - memmove(c->inval_buf, (char *) *source, - *sourcelen); - c->inval_buf[*sourcelen] = '\0'; - c->inval_len = *sourcelen; - - *source += *sourcelen; - *sourcelen = 0; - - nextchar = 0; - } else { - return error; - } - } - - /* output U+FFFD and continue processing. */ - error = hubbub_utf16_codec_filter_decoded_char(c, - 0xFFFD, dest, destlen); - if (error == HUBBUB_OK || error == HUBBUB_NOMEM) { - /* filter function succeeded; update source pointers */ - *source += nextchar; - *sourcelen -= nextchar; - } - - return error; - } - - return HUBBUB_OK; -} - -/** - * Feed a UCS4 character through the registered filter and output the result - * - * \param c Codec to use - * \param ucs4 UCS4 character (host endian) - * \param dest Pointer to pointer to output buffer - * \param destlen Pointer to output buffer length - * \return HUBBUB_OK on success, - * HUBBUB_NOMEM if output buffer is too small, - * as a result of the failure of the - * client-provided filter function. - */ -hubbub_error hubbub_utf16_codec_filter_decoded_char(hubbub_utf16_codec *c, - uint32_t ucs4, uint8_t **dest, size_t *destlen) -{ - if (c->base.filter != NULL) { - uint32_t *rep; - size_t replen; - hubbub_error error; - - error = c->base.filter(ucs4, &rep, &replen, - c->base.filter_pw); - if (error != HUBBUB_OK) { - return error; - } - - while (replen > 0 && *destlen >= replen * 4) { - *((uint32_t *) (void *) *dest) = htonl(*rep); - - *dest += 4; - *destlen -= 4; - - rep++; - replen--; - } - - if (*destlen < replen * 4) { - /* Run out of output buffer */ - size_t i; - - /* Buffer remaining output */ - c->read_len = replen; - - for (i = 0; i < replen; i++) { - c->read_buf[i] = rep[i]; - } - - return HUBBUB_NOMEM; - } - - } else { - if (*destlen < 4) { - /* Run out of output buffer */ - c->read_len = 1; - c->read_buf[0] = ucs4; - - return HUBBUB_NOMEM; - } - - *((uint32_t *) (void *) *dest) = htonl(ucs4); - *dest += 4; - *destlen -= 4; - } - - return HUBBUB_OK; -} - - -const hubbub_charsethandler hubbub_utf16_codec_handler = { - hubbub_utf16_codec_handles_charset, - hubbub_utf16_codec_create -}; Index: src/charset/codec_impl.h =================================================================== --- src/charset/codec_impl.h (revision 4665) +++ src/charset/codec_impl.h (working copy) @@ -1,51 +0,0 @@ -/* - * This file is part of Hubbub. - * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell - */ - -#ifndef hubbub_charset_codecimpl_h_ -#define hubbub_charset_codecimpl_h_ - -#include -#include - -#include "codec.h" - -/** - * Core charset codec definition; implementations extend this - */ -struct hubbub_charsetcodec { - uint16_t mibenum; /**< MIB enum for charset */ - - hubbub_charsetcodec_filter filter; /**< filter function */ - void *filter_pw; /**< filter private word */ - - hubbub_charsetcodec_errormode errormode; /**< error mode */ - - hubbub_alloc alloc; /**< allocation function */ - void *alloc_pw; /**< private word */ - - struct { - void (*destroy)(hubbub_charsetcodec *codec); - hubbub_error (*encode)(hubbub_charsetcodec *codec, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen); - hubbub_error (*decode)(hubbub_charsetcodec *codec, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen); - hubbub_error (*reset)(hubbub_charsetcodec *codec); - } handler; /**< Vtable for handler code */ -}; - -/** - * Codec factory component definition - */ -typedef struct hubbub_charsethandler { - bool (*handles_charset)(const char *charset); - hubbub_charsetcodec *(*create)(const char *charset, - hubbub_alloc alloc, void *pw); -} hubbub_charsethandler; - -#endif Index: src/charset/codec_utf8.c =================================================================== --- src/charset/codec_utf8.c (revision 4665) +++ src/charset/codec_utf8.c (working copy) @@ -1,620 +0,0 @@ -/* - * This file is part of Hubbub. - * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell - */ - -#include -#include - -/* These two are for htonl / ntohl */ -#include -#include - -#include "charset/aliases.h" -#include "utils/utf8.h" -#include "utils/utils.h" - -#include "codec_impl.h" - -/** - * UTF-8 charset codec - */ -typedef struct hubbub_utf8_codec { - hubbub_charsetcodec base; /**< Base class */ - -#define INVAL_BUFSIZE (32) - uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up - * incomplete input - * sequences */ - size_t inval_len; /*< Byte length of inval_buf **/ - -#define READ_BUFSIZE (8) - uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial - * output sequences (decode) - * (host-endian) */ - size_t read_len; /**< Character length of read_buf */ - -#define WRITE_BUFSIZE (8) - uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial - * output sequences (encode) - * (host-endian) */ - size_t write_len; /**< Character length of write_buf */ - -} hubbub_utf8_codec; - -static bool hubbub_utf8_codec_handles_charset(const char *charset); -static hubbub_charsetcodec *hubbub_utf8_codec_create(const char *charset, - hubbub_alloc alloc, void *pw); -static void hubbub_utf8_codec_destroy (hubbub_charsetcodec *codec); -static hubbub_error hubbub_utf8_codec_encode(hubbub_charsetcodec *codec, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen); -static hubbub_error hubbub_utf8_codec_decode(hubbub_charsetcodec *codec, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen); -static hubbub_error hubbub_utf8_codec_reset(hubbub_charsetcodec *codec); -static hubbub_error hubbub_utf8_codec_read_char(hubbub_utf8_codec *c, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen); -static hubbub_error hubbub_utf8_codec_filter_decoded_char( - hubbub_utf8_codec *c, - uint32_t ucs4, uint8_t **dest, size_t *destlen); - -/** - * Determine whether this codec handles a specific charset - * - * \param charset Charset to test - * \return true if handleable, false otherwise - */ -bool hubbub_utf8_codec_handles_charset(const char *charset) -{ - return hubbub_mibenum_from_name(charset, strlen(charset)) == - hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")); -} - -/** - * Create a utf8 codec - * - * \param charset The charset to read from / write to - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data (may be NULL) - * \return Pointer to codec, or NULL on failure - */ -hubbub_charsetcodec *hubbub_utf8_codec_create(const char *charset, - hubbub_alloc alloc, void *pw) -{ - hubbub_utf8_codec *codec; - - UNUSED(charset); - - codec = alloc(NULL, sizeof(hubbub_utf8_codec), pw); - if (codec == NULL) - return NULL; - - codec->inval_buf[0] = '\0'; - codec->inval_len = 0; - - codec->read_buf[0] = 0; - codec->read_len = 0; - - codec->write_buf[0] = 0; - codec->write_len = 0; - - /* Finally, populate vtable */ - codec->base.handler.destroy = hubbub_utf8_codec_destroy; - codec->base.handler.encode = hubbub_utf8_codec_encode; - codec->base.handler.decode = hubbub_utf8_codec_decode; - codec->base.handler.reset = hubbub_utf8_codec_reset; - - return (hubbub_charsetcodec *) codec; -} - -/** - * Destroy a utf8 codec - * - * \param codec The codec to destroy - */ -void hubbub_utf8_codec_destroy (hubbub_charsetcodec *codec) -{ - UNUSED(codec); -} - -/** - * Encode a chunk of UCS4 data into utf8 - * - * \param codec The codec to use - * \param source Pointer to pointer to source data - * \param sourcelen Pointer to length (in bytes) of source data - * \param dest Pointer to pointer to output buffer - * \param destlen Pointer to length (in bytes) of output buffer - * \return HUBBUB_OK on success, - * HUBBUB_NOMEM if output buffer is too small, - * HUBBUB_INVALID if a character cannot be represented and the - * codec's error handling mode is set to STRICT, - * as a result of the failure of the - * client-provided filter function. - * - * On exit, ::source will point immediately _after_ the last input character - * read. Any remaining output for the character will be buffered by the - * codec for writing on the next call. This buffered data is post-filtering, - * so will not be refiltered on the next call. - * - * In the case of the filter function failing, ::source will point _at_ the - * last input character read; nothing will be written or buffered for the - * failed character. It is up to the client to fix the cause of the failure - * and retry the encoding process. - * - * Note that, if failure occurs whilst attempting to write any output - * buffered by the last call, then ::source and ::sourcelen will remain - * unchanged (as nothing more has been read). - * - * There is no way to determine the output character which caused a - * failure (as it may be one in a filter-injected replacement sequence). - * It is, however, possible to determine which source character caused it - * (this being the character immediately before the location pointed to by - * ::source on exit). - * - * [I.e. the process of filtering results in a potential one-to-many mapping - * between source characters and output characters, and identification of - * individual output characters is impossible.] - * - * ::sourcelen will be reduced appropriately on exit. - * - * ::dest will point immediately _after_ the last character written. - * - * ::destlen will be reduced appropriately on exit. - */ -hubbub_error hubbub_utf8_codec_encode(hubbub_charsetcodec *codec, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen) -{ - hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec; - uint32_t ucs4; - uint32_t *towrite; - size_t towritelen; - hubbub_error error; - - /* Process any outstanding characters from the previous call */ - if (c->write_len > 0) { - uint32_t *pwrite = c->write_buf; - uint8_t buf[6]; - size_t len; - - while (c->write_len > 0) { - error = hubbub_utf8_from_ucs4(pwrite[0], buf, &len); - if (error != HUBBUB_OK) - abort(); - - if (*destlen < len) { - /* Insufficient output buffer space */ - for (len = 0; len < c->write_len; len++) - c->write_buf[len] = pwrite[len]; - - return HUBBUB_NOMEM; - } - - memcpy(*dest, buf, len); - - *dest += len; - *destlen -= len; - - pwrite++; - c->write_len--; - } - } - - /* Now process the characters for this call */ - while (*sourcelen > 0) { - ucs4 = ntohl(*((uint32_t *) (void *) *source)); - towrite = &ucs4; - towritelen = 1; - - /* Run character we're about to output through the - * registered filter, so it can replace it. */ - if (c->base.filter != NULL) { - error = c->base.filter(ucs4, - &towrite, &towritelen, - c->base.filter_pw); - if (error != HUBBUB_OK) - return error; - } - - /* Output current characters */ - while (towritelen > 0) { - uint8_t buf[6]; - size_t len; - - error = hubbub_utf8_from_ucs4(towrite[0], buf, &len); - if (error != HUBBUB_OK) - abort(); - - if (*destlen < len) { - /* Insufficient output space */ - if (towritelen >= WRITE_BUFSIZE) - abort(); - - c->write_len = towritelen; - - /* Copy pending chars to save area, for - * processing next call. */ - for (len = 0; len < towritelen; len++) - c->write_buf[len] = towrite[len]; - - /* Claim character we've just buffered, - * so it's not reprocessed */ - *source += 4; - *sourcelen -= 4; - - return HUBBUB_NOMEM; - } - - memcpy(*dest, buf, len); - - *dest += len; - *destlen -= len; - - towrite++; - towritelen--; - } - - *source += 4; - *sourcelen -= 4; - } - - return HUBBUB_OK; -} - -/** - * Decode a chunk of utf8 data into UCS4 - * - * \param codec The codec to use - * \param source Pointer to pointer to source data - * \param sourcelen Pointer to length (in bytes) of source data - * \param dest Pointer to pointer to output buffer - * \param destlen Pointer to length (in bytes) of output buffer - * \return HUBBUB_OK on success, - * HUBBUB_NOMEM if output buffer is too small, - * HUBBUB_INVALID if a character cannot be represented and the - * codec's error handling mode is set to STRICT, - * as a result of the failure of the - * client-provided filter function. - * - * On exit, ::source will point immediately _after_ the last input character - * read, if the result is _OK or _NOMEM. Any remaining output for the - * character will be buffered by the codec for writing on the next call. - * This buffered data is post-filtering, so will not be refiltered on the - * next call. - * - * In the case of the result being _INVALID or the filter function failing, - * ::source will point _at_ the last input character read; nothing will be - * written or buffered for the failed character. It is up to the client to - * fix the cause of the failure and retry the decoding process. - * - * Note that, if failure occurs whilst attempting to write any output - * buffered by the last call, then ::source and ::sourcelen will remain - * unchanged (as nothing more has been read). - * - * There is no way to determine the output character which caused a - * failure (as it may be one in a filter-injected replacement sequence). - * It is, however, possible to determine which source character caused it - * (this being the character immediately at or before the location pointed - * to by ::source on exit). - * - * [I.e. the process of filtering results in a potential one-to-many mapping - * between source characters and output characters, and identification of - * individual output characters is impossible.] - * - * If STRICT error handling is configured and an illegal sequence is split - * over two calls, then _INVALID will be returned from the second call, - * but ::source will point mid-way through the invalid sequence (i.e. it - * will be unmodified over the second call). In addition, the internal - * incomplete-sequence buffer will be emptied, such that subsequent calls - * will progress, rather than re-evaluating the same invalid sequence. - * - * ::sourcelen will be reduced appropriately on exit. - * - * ::dest will point immediately _after_ the last character written. - * - * ::destlen will be reduced appropriately on exit. - * - * Call this with a source length of 0 to flush the output buffer. - */ -hubbub_error hubbub_utf8_codec_decode(hubbub_charsetcodec *codec, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen) -{ - hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec; - hubbub_error error; - - if (c->read_len > 0) { - /* Output left over from last decode */ - uint32_t *pread = c->read_buf; - - while (c->read_len > 0 && *destlen >= c->read_len * 4) { - *((uint32_t *) (void *) *dest) = htonl(pread[0]); - - *dest += 4; - *destlen -= 4; - - pread++; - c->read_len--; - } - - if (*destlen < c->read_len * 4) { - /* Ran out of output buffer */ - size_t i; - - /* Shuffle remaining output down */ - for (i = 0; i < c->read_len; i++) - c->read_buf[i] = pread[i]; - - return HUBBUB_NOMEM; - } - } - - if (c->inval_len > 0) { - /* The last decode ended in an incomplete sequence. - * Fill up inval_buf with data from the start of the - * new chunk and process it. */ - uint8_t *in = c->inval_buf; - size_t ol = c->inval_len; - size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen); - size_t orig_l = l; - - memcpy(c->inval_buf + ol, *source, l); - - l += c->inval_len; - - error = hubbub_utf8_codec_read_char(c, - (const uint8_t **) &in, &l, dest, destlen); - if (error != HUBBUB_OK && error != HUBBUB_NOMEM) { - return error; - } - - /* And now, fix up source pointers */ - *source += max((signed) (orig_l - l), 0); - *sourcelen -= max((signed) (orig_l - l), 0); - - /* Failed to resolve an incomplete character and - * ran out of buffer space. No recovery strategy - * possible, so explode everywhere. */ - if ((orig_l + ol) - l == 0) - abort(); - - /* Report memory exhaustion case from above */ - if (error != HUBBUB_OK) - return error; - } - - /* Finally, the "normal" case; process all outstanding characters */ - while (*sourcelen > 0) { - error = hubbub_utf8_codec_read_char(c, - source, sourcelen, dest, destlen); - if (error != HUBBUB_OK) { - return error; - } - } - - return HUBBUB_OK; -} - -/** - * Clear a utf8 codec's encoding state - * - * \param codec The codec to reset - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf8_codec_reset(hubbub_charsetcodec *codec) -{ - hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec; - - c->inval_buf[0] = '\0'; - c->inval_len = 0; - - c->read_buf[0] = 0; - c->read_len = 0; - - c->write_buf[0] = 0; - c->write_len = 0; - - return HUBBUB_OK; -} - - -/** - * Read a character from the UTF-8 to UCS4 (big endian) - * - * \param c The codec - * \param source Pointer to pointer to source buffer (updated on exit) - * \param sourcelen Pointer to length of source buffer (updated on exit) - * \param dest Pointer to pointer to output buffer (updated on exit) - * \param destlen Pointer to length of output buffer (updated on exit) - * \return HUBBUB_OK on success, - * HUBBUB_NOMEM if output buffer is too small, - * HUBBUB_INVALID if a character cannot be represented and the - * codec's error handling mode is set to STRICT, - * as a result of the failure of the - * client-provided filter function. - * - * On exit, ::source will point immediately _after_ the last input character - * read, if the result is _OK or _NOMEM. Any remaining output for the - * character will be buffered by the codec for writing on the next call. - * This buffered data is post-filtering, so will not be refiltered on the - * next call. - * - * In the case of the result being _INVALID or the filter function failing, - * ::source will point _at_ the last input character read; nothing will be - * written or buffered for the failed character. It is up to the client to - * fix the cause of the failure and retry the decoding process. - * - * ::sourcelen will be reduced appropriately on exit. - * - * ::dest will point immediately _after_ the last character written. - * - * ::destlen will be reduced appropriately on exit. - */ -hubbub_error hubbub_utf8_codec_read_char(hubbub_utf8_codec *c, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen) -{ - uint32_t ucs4; - size_t sucs4; - hubbub_error error; - - /* Convert a single character */ - error = hubbub_utf8_to_ucs4(*source, *sourcelen, &ucs4, &sucs4); - if (error == HUBBUB_OK) { - /* Read a character */ - error = hubbub_utf8_codec_filter_decoded_char(c, - ucs4, dest, destlen); - if (error == HUBBUB_OK || error == HUBBUB_NOMEM) { - /* filter function succeeded; update source pointers */ - *source += sucs4; - *sourcelen -= sucs4; - } - - /* Clear inval buffer */ - c->inval_buf[0] = '\0'; - c->inval_len = 0; - - return error; - } else if (error == HUBBUB_NEEDDATA) { - /* Incomplete input sequence */ - if (*sourcelen > INVAL_BUFSIZE) - abort(); - - memmove(c->inval_buf, (char *) *source, *sourcelen); - c->inval_buf[*sourcelen] = '\0'; - c->inval_len = *sourcelen; - - *source += *sourcelen; - *sourcelen = 0; - - return HUBBUB_OK; - } else if (error == HUBBUB_INVALID) { - /* Illegal input sequence */ - uint32_t nextchar; - - /* Clear inval buffer */ - c->inval_buf[0] = '\0'; - c->inval_len = 0; - - /* Strict errormode; simply flag invalid character */ - if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT) { - return HUBBUB_INVALID; - } - - /* Find next valid UTF-8 sequence. - * We're processing client-provided data, so let's - * be paranoid about its validity. */ - error = hubbub_utf8_next_paranoid(*source, *sourcelen, - 0, &nextchar); - if (error != HUBBUB_OK) { - if (error == HUBBUB_NEEDDATA) { - /* Need more data to be sure */ - if (*sourcelen > INVAL_BUFSIZE) - abort(); - - memmove(c->inval_buf, (char *) *source, - *sourcelen); - c->inval_buf[*sourcelen] = '\0'; - c->inval_len = *sourcelen; - - *source += *sourcelen; - *sourcelen = 0; - - nextchar = 0; - } else { - return error; - } - } - - /* output U+FFFD and continue processing. */ - error = hubbub_utf8_codec_filter_decoded_char(c, - 0xFFFD, dest, destlen); - if (error == HUBBUB_OK || error == HUBBUB_NOMEM) { - /* filter function succeeded; update source pointers */ - *source += nextchar; - *sourcelen -= nextchar; - } - - return error; - } - - return HUBBUB_OK; -} - -/** - * Feed a UCS4 character through the registered filter and output the result - * - * \param c Codec to use - * \param ucs4 UCS4 character (host endian) - * \param dest Pointer to pointer to output buffer - * \param destlen Pointer to output buffer length - * \return HUBBUB_OK on success, - * HUBBUB_NOMEM if output buffer is too small, - * as a result of the failure of the - * client-provided filter function. - */ -hubbub_error hubbub_utf8_codec_filter_decoded_char(hubbub_utf8_codec *c, - uint32_t ucs4, uint8_t **dest, size_t *destlen) -{ - if (c->base.filter != NULL) { - uint32_t *rep; - size_t replen; - hubbub_error error; - - error = c->base.filter(ucs4, &rep, &replen, - c->base.filter_pw); - if (error != HUBBUB_OK) { - return error; - } - - while (replen > 0 && *destlen >= replen * 4) { - *((uint32_t *) (void *) *dest) = htonl(*rep); - - *dest += 4; - *destlen -= 4; - - rep++; - replen--; - } - - if (*destlen < replen * 4) { - /* Run out of output buffer */ - size_t i; - - /* Buffer remaining output */ - c->read_len = replen; - - for (i = 0; i < replen; i++) { - c->read_buf[i] = rep[i]; - } - - return HUBBUB_NOMEM; - } - - } else { - if (*destlen < 4) { - /* Run out of output buffer */ - c->read_len = 1; - c->read_buf[0] = ucs4; - - return HUBBUB_NOMEM; - } - - *((uint32_t *) (void *) *dest) = htonl(ucs4); - *dest += 4; - *destlen -= 4; - } - - return HUBBUB_OK; -} - - -const hubbub_charsethandler hubbub_utf8_codec_handler = { - hubbub_utf8_codec_handles_charset, - hubbub_utf8_codec_create -}; Index: src/charset/aliases.h =================================================================== --- src/charset/aliases.h (revision 4665) +++ src/charset/aliases.h (working copy) @@ -1,42 +0,0 @@ -/* - * This file is part of Hubbub. - * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell - */ - -#ifndef hubbub_charset_aliases_h_ -#define hubbub_charset_aliases_h_ - -#include - -#include -#include - -typedef struct hubbub_aliases_canon { - struct hubbub_aliases_canon *next; - uint16_t mib_enum; - uint16_t name_len; - char name[1]; -} hubbub_aliases_canon; - -/* Load encoding aliases from file */ -hubbub_error hubbub_aliases_create(const char *filename, - hubbub_alloc alloc, void *pw); -/* Destroy encoding aliases */ -void hubbub_aliases_destroy(hubbub_alloc alloc, void *pw); - -/* Convert an encoding alias to a MIB enum value */ -uint16_t hubbub_mibenum_from_name(const char *alias, size_t len); -/* Convert a MIB enum value into an encoding alias */ -const char *hubbub_mibenum_to_name(uint16_t mibenum); - -/* Canonicalise an alias name */ -hubbub_aliases_canon *hubbub_alias_canonicalise(const char *alias, - size_t len); - -#ifndef NDEBUG -void hubbub_aliases_dump(void); -#endif - -#endif Index: src/charset/detect.c =================================================================== --- src/charset/detect.c (revision 4665) +++ src/charset/detect.c (working copy) @@ -8,12 +8,15 @@ #include #include -#include "charset/aliases.h" +#include + +#include + #include "utils/utils.h" #include "detect.h" -static uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len); +static uint16_t hubbub_charset_read_bom(const uint8_t *data, size_t len); static uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len); static uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, const uint8_t *end); @@ -27,31 +30,31 @@ /** * Extract a charset from a chunk of data * - * \param data Pointer to pointer to buffer containing data - * \param len Pointer to buffer length - * \param mibenum Pointer to location to store MIB enum representing charset - * \param source Pointer to location to receive charset source - * \return HUBBUB_OK on success, appropriate error otherwise + * \param data Pointer to buffer containing data + * \param len Buffer length + * \param mibenum Pointer to location containing current MIB enum + * \param source Pointer to location containint current charset source + * \return PARSERUTILS_OK on success, appropriate error otherwise * - * The data pointer and length will be modified by this function if - * a byte order mark is encountered at the start of the buffer. The updated - * data pointer will point to the first byte in the buffer after the BOM. - * The length will be modified appropriately. + * ::mibenum and ::source will be updated on exit * * The larger a chunk of data fed to this routine, the better, as it allows * charset autodetection access to a larger dataset for analysis. */ -hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len, - uint16_t *mibenum, hubbub_charset_source *source) +parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len, + uint16_t *mibenum, uint32_t *source) { uint16_t charset = 0; - if (data == NULL || *data == NULL || len == NULL || - mibenum == NULL || source == NULL) - return HUBBUB_BADPARM; + if (data == NULL || mibenum == NULL || source == NULL) + return PARSERUTILS_BADPARM; + /* If the source is dictated, there's nothing for us to do */ + if (*source == HUBBUB_CHARSET_DICTATED) + return PARSERUTILS_OK; + /* We need at least 4 bytes of data */ - if (*len < 4) + if (len < 4) goto default_encoding; /* First, look for a BOM */ @@ -60,21 +63,21 @@ *mibenum = charset; *source = HUBBUB_CHARSET_DOCUMENT; - return HUBBUB_OK; + return PARSERUTILS_OK; } /* No BOM was found, so we must look for a meta charset within * the document itself. */ - charset = hubbub_charset_scan_meta(*data, *len); + charset = hubbub_charset_scan_meta(data, len); if (charset != 0) { /* ISO-8859-1 becomes Windows-1252 */ - if (charset == hubbub_mibenum_from_name("ISO-8859-1", - SLEN("ISO-8859-1"))) { - charset = hubbub_mibenum_from_name("Windows-1252", - SLEN("Windows-1252")); + if (charset == parserutils_charset_mibenum_from_name( + "ISO-8859-1", SLEN("ISO-8859-1"))) { + charset = parserutils_charset_mibenum_from_name( + "Windows-1252", SLEN("Windows-1252")); /* Fallback to 8859-1 if that failed */ if (charset == 0) - charset = hubbub_mibenum_from_name( + charset = parserutils_charset_mibenum_from_name( "ISO-8859-1", SLEN("ISO-8859-1")); } @@ -94,23 +97,23 @@ * autodetection routines (or the fallback case if they * fail). */ - if (charset != hubbub_mibenum_from_name("UTF-16", + if (charset != parserutils_charset_mibenum_from_name("UTF-16", SLEN("UTF-16")) && - charset != hubbub_mibenum_from_name("UTF-16LE", - SLEN("UTF-16LE")) && - charset != hubbub_mibenum_from_name("UTF-16BE", - SLEN("UTF-16BE")) && - charset != hubbub_mibenum_from_name("UTF-32", - SLEN("UTF-32")) && - charset != hubbub_mibenum_from_name("UTF-32LE", - SLEN("UTF-32LE")) && - charset != hubbub_mibenum_from_name("UTF-32BE", - SLEN("UTF-32BE"))) { + charset != parserutils_charset_mibenum_from_name( + "UTF-16LE", SLEN("UTF-16LE")) && + charset != parserutils_charset_mibenum_from_name( + "UTF-16BE", SLEN("UTF-16BE")) && + charset != parserutils_charset_mibenum_from_name( + "UTF-32", SLEN("UTF-32")) && + charset != parserutils_charset_mibenum_from_name( + "UTF-32LE", SLEN("UTF-32LE")) && + charset != parserutils_charset_mibenum_from_name( + "UTF-32BE", SLEN("UTF-32BE"))) { *mibenum = charset; *source = HUBBUB_CHARSET_DOCUMENT; - return HUBBUB_OK; + return PARSERUTILS_OK; } } @@ -122,16 +125,16 @@ /* We failed to autodetect a charset, so use the default fallback */ default_encoding: - charset = hubbub_mibenum_from_name("Windows-1252", + charset = parserutils_charset_mibenum_from_name("Windows-1252", SLEN("Windows-1252")); if (charset == 0) - charset = hubbub_mibenum_from_name("ISO-8859-1", + charset = parserutils_charset_mibenum_from_name("ISO-8859-1", SLEN("ISO-8859-1")); *mibenum = charset; *source = HUBBUB_CHARSET_DEFAULT; - return HUBBUB_OK; + return PARSERUTILS_OK; } @@ -139,65 +142,38 @@ * Inspect the beginning of a buffer of data for the presence of a * UTF Byte Order Mark. * - * \param data Pointer to pointer to buffer containing data - * \param len Pointer to buffer length + * \param data Pointer to buffer containing data + * \param len Buffer length * \return MIB enum representing encoding described by BOM, or 0 if not found - * - * If a BOM is found, the data pointer will be modified to point to the first - * byte in the buffer after the BOM. The length will also be modified - * appropriately. */ -uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len) +uint16_t hubbub_charset_read_bom(const uint8_t *data, size_t len) { - if (data == NULL || *data == NULL || len == NULL) + if (data == NULL) return 0; /* We require at least 4 bytes of data */ - if (*len < 4) + if (len < 4) return 0; -#define UTF32BOM_LEN (4) -#define UTF16BOM_LEN (2) -#define UTF8BOM_LEN (3) - - if ((*data)[0] == 0x00 && (*data)[1] == 0x00 && - (*data)[2] == 0xFE && (*data)[3] == 0xFF) { - *data += UTF32BOM_LEN; - *len -= UTF32BOM_LEN; - - return hubbub_mibenum_from_name("UTF-32BE", + if (data[0] == 0x00 && data[1] == 0x00 && + data[2] == 0xFE && data[3] == 0xFF) { + return parserutils_charset_mibenum_from_name("UTF-32BE", SLEN("UTF-32BE")); - } else if ((*data)[0] == 0xFF && (*data)[1] == 0xFE && - (*data)[2] == 0x00 && (*data)[3] == 0x00) { - *data += UTF32BOM_LEN; - *len -= UTF32BOM_LEN; - - return hubbub_mibenum_from_name("UTF-32LE", + } else if (data[0] == 0xFF && data[1] == 0xFE && + data[2] == 0x00 && data[3] == 0x00) { + return parserutils_charset_mibenum_from_name("UTF-32LE", SLEN("UTF-32LE")); - } else if ((*data)[0] == 0xFE && (*data)[1] == 0xFF) { - *data += UTF16BOM_LEN; - *len -= UTF16BOM_LEN; - - return hubbub_mibenum_from_name("UTF-16BE", + } else if (data[0] == 0xFE && data[1] == 0xFF) { + return parserutils_charset_mibenum_from_name("UTF-16BE", SLEN("UTF-16BE")); - } else if ((*data)[0] == 0xFF && (*data)[1] == 0xFE) { - *data += UTF16BOM_LEN; - *len -= UTF16BOM_LEN; - - return hubbub_mibenum_from_name("UTF-16LE", + } else if (data[0] == 0xFF && data[1] == 0xFE) { + return parserutils_charset_mibenum_from_name("UTF-16LE", SLEN("UTF-16LE")); - } else if ((*data)[0] == 0xEF && (*data)[1] == 0xBB && - (*data)[2] == 0xBF) { - *data += UTF8BOM_LEN; - *len -= UTF8BOM_LEN; - - return hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")); + } else if (data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) { + return parserutils_charset_mibenum_from_name("UTF-8", + SLEN("UTF-8")); } -#undef UTF32BOM_LEN -#undef UTF16BOM_LEN -#undef UTF8BOM_LEN - return 0; } @@ -223,7 +199,7 @@ * Search for a meta charset within a buffer of data * * \param data Pointer to buffer containing data - * \param len Length of buffer in data + * \param len Length of buffer * \return MIB enum representing encoding, or 0 if none found */ uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len) @@ -344,7 +320,7 @@ while (valuelen > 0 && ISSPACE(value[valuelen - 1])) valuelen--; - mibenum = hubbub_mibenum_from_name( + mibenum = parserutils_charset_mibenum_from_name( (const char *) value, valuelen); if (mibenum != 0) return mibenum; @@ -478,8 +454,8 @@ /* 8 */ if (tentative != NULL) { - return hubbub_mibenum_from_name((const char *) tentative, - tentative_len); + return parserutils_charset_mibenum_from_name( + (const char *) tentative, tentative_len); } /* 9 */ Index: src/charset/codec_iconv.c =================================================================== --- src/charset/codec_iconv.c (revision 4665) +++ src/charset/codec_iconv.c (working copy) @@ -1,837 +0,0 @@ -/* - * This file is part of Hubbub. - * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell - */ - -/* This codec is hideously slow. Only use it as a last resort */ - -#include -#include -#include - -#include - -/* These two are for htonl / ntohl */ -#include -#include - -#include "charset/aliases.h" -#include "utils/utils.h" - -#include "codec_impl.h" - -/** - * A note on endianness: - * - * UCS4 is big-endian by default. Therefore, this codec reads and writes - * big-endian values. This is fine, and causes no problems. However, to - * make life easier for client-supplied filter code, character values passed - * to a filter and those read back from a filter are in host-endian. - * Therefore, we need to convert from big-endian to host-endian when passing - * characters to a filter and perform the reverse translation when reading - * characters back. - */ - -/** - * Iconv-based charset codec - */ -typedef struct hubbub_iconv_codec { - hubbub_charsetcodec base; /**< Base class */ - - iconv_t read_cd; /**< Iconv handle for reading */ -#define INVAL_BUFSIZE (32) - uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up - * incomplete input - * sequences */ - size_t inval_len; /**< Number of bytes in inval_buf */ - -#define READ_BUFSIZE (8) - uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial - * output sequences (decode) - */ - size_t read_len; /**< Number of characters in - * read_buf */ - - iconv_t write_cd; /**< Iconv handle for writing */ -#define WRITE_BUFSIZE (8) - uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial - * output sequences (encode) - */ - size_t write_len; /**< Number of characters in - * write_buf */ -} hubbub_iconv_codec; - - -static bool hubbub_iconv_codec_handles_charset(const char *charset); -static hubbub_charsetcodec *hubbub_iconv_codec_create(const char *charset, - hubbub_alloc alloc, void *pw); -static void hubbub_iconv_codec_destroy (hubbub_charsetcodec *codec); -static hubbub_error hubbub_iconv_codec_encode(hubbub_charsetcodec *codec, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen); -static hubbub_error hubbub_iconv_codec_decode(hubbub_charsetcodec *codec, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen); -static hubbub_error hubbub_iconv_codec_reset(hubbub_charsetcodec *codec); -static hubbub_error hubbub_iconv_codec_filter_decoded_char( - hubbub_iconv_codec *c, uint32_t ucs4, uint8_t **dest, - size_t *destlen); -static bool hubbub_iconv_codec_is_unicode(hubbub_iconv_codec *c); -static hubbub_error hubbub_iconv_codec_read_char(hubbub_iconv_codec *c, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen); -static hubbub_error hubbub_iconv_codec_write_char(hubbub_iconv_codec *c, - uint32_t ucs4, uint8_t **dest, size_t *destlen); - -/** - * Determine whether this codec handles a specific charset - * - * \param charset Charset to test - * \return true if handleable, false otherwise - */ -bool hubbub_iconv_codec_handles_charset(const char *charset) -{ - iconv_t cd; - bool ret; - - cd = iconv_open("UCS-4", charset); - - ret = (cd != (iconv_t) -1); - - if (ret) - iconv_close(cd); - - return ret; -} - -/** - * Create an iconv-based codec - * - * \param charset The charset to read from / write to - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data (may be NULL) - * \return Pointer to codec, or NULL on failure - */ -hubbub_charsetcodec *hubbub_iconv_codec_create(const char *charset, - hubbub_alloc alloc, void *pw) -{ - hubbub_iconv_codec *codec; - - codec = alloc(NULL, sizeof(hubbub_iconv_codec), pw); - if (codec == NULL) - return NULL; - - codec->read_cd = iconv_open("UCS-4", charset); - if (codec->read_cd == (iconv_t) -1) { - alloc(codec, 0, pw); - return NULL; - } - - codec->write_cd = iconv_open(charset, "UCS-4"); - if (codec->write_cd == (iconv_t) -1) { - iconv_close(codec->read_cd); - alloc(codec, 0, pw); - return NULL; - } - - codec->inval_buf[0] = '\0'; - codec->inval_len = 0; - - codec->read_buf[0] = 0; - codec->read_len = 0; - - codec->write_buf[0] = 0; - codec->write_len = 0; - - /* Finally, populate vtable */ - codec->base.handler.destroy = hubbub_iconv_codec_destroy; - codec->base.handler.encode = hubbub_iconv_codec_encode; - codec->base.handler.decode = hubbub_iconv_codec_decode; - codec->base.handler.reset = hubbub_iconv_codec_reset; - - return (hubbub_charsetcodec *) codec; -} - -/** - * Destroy an iconv-based codec - * - * \param codec The codec to destroy - */ -void hubbub_iconv_codec_destroy (hubbub_charsetcodec *codec) -{ - hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec; - - iconv_close(c->read_cd); - iconv_close(c->write_cd); - - return; -} - -/** - * Encode a chunk of UCS4 data into an iconv-based codec's charset - * - * \param codec The codec to use - * \param source Pointer to pointer to source data - * \param sourcelen Pointer to length (in bytes) of source data - * \param dest Pointer to pointer to output buffer - * \param destlen Pointer to length (in bytes) of output buffer - * \return HUBBUB_OK on success, - * HUBBUB_NOMEM if output buffer is too small, - * HUBBUB_INVALID if a character cannot be represented and the - * codec's error handling mode is set to STRICT, - * as a result of the failure of the - * client-provided filter function. - * - * On exit, ::source will point immediately _after_ the last input character - * read. Any remaining output for the character will be buffered by the - * codec for writing on the next call. This buffered data is post-filtering, - * so will not be refiltered on the next call. - * - * In the case of the filter function failing, ::source will point _at_ the - * last input character read; nothing will be written or buffered for the - * failed character. It is up to the client to fix the cause of the failure - * and retry the encoding process. - * - * Note that, if failure occurs whilst attempting to write any output - * buffered by the last call, then ::source and ::sourcelen will remain - * unchanged (as nothing more has been read). - * - * There is no way to determine the output character which caused a - * failure (as it may be one in a filter-injected replacement sequence). - * It is, however, possible to determine which source character caused it - * (this being the character immediately before the location pointed to by - * ::source on exit). - * - * [I.e. the process of filtering results in a potential one-to-many mapping - * between source characters and output characters, and identification of - * individual output characters is impossible.] - * - * ::sourcelen will be reduced appropriately on exit. - * - * ::dest will point immediately _after_ the last character written. - * - * ::destlen will be reduced appropriately on exit. - */ -hubbub_error hubbub_iconv_codec_encode(hubbub_charsetcodec *codec, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen) -{ - hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec; - uint32_t ucs4; - const uint32_t *towrite; - size_t towritelen; - hubbub_error error; - - /* Process any outstanding characters from the previous call */ - if (c->write_len > 0) { - uint32_t *pwrite = c->write_buf; - - while (c->write_len > 0) { - error = hubbub_iconv_codec_write_char(c, pwrite[0], - dest, destlen); - if (error != HUBBUB_OK) { - /* Copy outstanding chars down, skipping - * invalid one, if present, so as to avoid - * reprocessing the invalid character */ - if (error == HUBBUB_INVALID) { - for (ucs4 = 1; ucs4 < c->write_len; - ucs4++) { - c->write_buf[ucs4] = - pwrite[ucs4]; - } - } - - return error; - } - - pwrite++; - c->write_len--; - } - } - - /* Now process the characters for this call */ - while (*sourcelen > 0) { - towrite = (const uint32_t *) (const void *) *source; - towritelen = 1; - ucs4 = *towrite; - - /* Run character we're about to output through the - * registered filter, so it can replace it, if it sees - * fit to do so */ - if (c->base.filter != NULL) { - uint32_t *replacement; - - error = c->base.filter(ntohl(ucs4), - &replacement, &towritelen, - c->base.filter_pw); - if (error != HUBBUB_OK) { - /* Don't eat character -- filter failed, - * so nothing gets written or buffered. - * It's up to the client to ensure that - * the filter works in the case where it - * reprocesses this character after the - * fault is fixed up. */ - - return error; - } - - /* Convert filter output to big endian UCS4 */ - for (ucs4 = 0; ucs4 < towritelen; ucs4++) { - replacement[ucs4] = htonl(replacement[ucs4]); - } - - towrite = (const uint32_t *) replacement; - } - - /* Output current character(s) */ - while (towritelen > 0) { - error = hubbub_iconv_codec_write_char(c, towrite[0], - dest, destlen); - - if (error != HUBBUB_OK) { - ucs4 = (error == HUBBUB_INVALID) ? 1 : 0; - - if (towritelen - ucs4 >= WRITE_BUFSIZE) - abort(); - - c->write_len = towritelen - ucs4; - - /* Copy pending chars to save area, for - * processing next call; skipping invalid - * character, if present, so it's not - * reprocessed. */ - for (; ucs4 < towritelen; ucs4++) { - c->write_buf[ucs4] = towrite[ucs4]; - } - - /* Claim character we've just buffered, - * so it's not repreocessed */ - *source += 4; - *sourcelen -= 4; - - return error; - } - - towrite++; - towritelen--; - } - - *source += 4; - *sourcelen -= 4; - } - - return HUBBUB_OK; -} - -/** - * Decode a chunk of data in an iconv-based codec's charset into UCS4 - * - * \param codec The codec to use - * \param source Pointer to pointer to source data - * \param sourcelen Pointer to length (in bytes) of source data - * \param dest Pointer to pointer to output buffer - * \param destlen Pointer to length (in bytes) of output buffer - * \return HUBBUB_OK on success, - * HUBBUB_NOMEM if output buffer is too small, - * HUBBUB_INVALID if a character cannot be represented and the - * codec's error handling mode is set to STRICT, - * as a result of the failure of the - * client-provided filter function. - * - * On exit, ::source will point immediately _after_ the last input character - * read, if the result is _OK or _NOMEM. Any remaining output for the - * character will be buffered by the codec for writing on the next call. - * This buffered data is post-filtering, so will not be refiltered on the - * next call. - * - * In the case of the result being _INVALID or the filter function failing, - * ::source will point _at_ the last input character read; nothing will be - * written or buffered for the failed character. It is up to the client to - * fix the cause of the failure and retry the decoding process. - * - * Note that, if failure occurs whilst attempting to write any output - * buffered by the last call, then ::source and ::sourcelen will remain - * unchanged (as nothing more has been read). - * - * There is no way to determine the output character which caused a - * failure (as it may be one in a filter-injected replacement sequence). - * It is, however, possible to determine which source character caused it - * (this being the character immediately at or before the location pointed - * to by ::source on exit). - * - * [I.e. the process of filtering results in a potential one-to-many mapping - * between source characters and output characters, and identification of - * individual output characters is impossible.] - * - * If STRICT error handling is configured and an illegal sequence is split - * over two calls, then _INVALID will be returned from the second call, - * but ::source will point mid-way through the invalid sequence (i.e. it - * will be unmodified over the second call). In addition, the internal - * incomplete-sequence buffer will be emptied, such that subsequent calls - * will progress, rather than re-evaluating the same invalid sequence. - * - * ::sourcelen will be reduced appropriately on exit. - * - * ::dest will point immediately _after_ the last character written. - * - * ::destlen will be reduced appropriately on exit. - * - * Call this with a source length of 0 to flush the output buffer. - */ -hubbub_error hubbub_iconv_codec_decode(hubbub_charsetcodec *codec, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen) -{ - hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec; - hubbub_error error; - - if (c->read_len > 0) { - /* Output left over from last decode - * Attempt to finish this here */ - uint32_t *pread = c->read_buf; - - while (c->read_len > 0 && *destlen >= c->read_len * 4) { - *((uint32_t *) (void *) *dest) = pread[0]; - - *dest += 4; - *destlen -= 4; - - pread++; - c->read_len--; - } - - if (*destlen < c->read_len * 4) { - /* Run out of output buffer */ - size_t i; - - /* Shuffle remaining output down */ - for (i = 0; i < c->read_len; i++) { - c->read_buf[i] = pread[i]; - } - - return HUBBUB_NOMEM; - } - } - - if (c->inval_len > 0) { - /* The last decode ended in an incomplete sequence. - * Fill up inval_buf with data from the start of the - * new chunk and process it. */ - uint8_t *in = c->inval_buf; - size_t ol = c->inval_len; - size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen); - size_t orig_l = l; - - memcpy(c->inval_buf + ol, *source, l); - - l += c->inval_len; - - error = hubbub_iconv_codec_read_char(c, - (const uint8_t **) &in, &l, dest, destlen); - if (error != HUBBUB_OK && error != HUBBUB_NOMEM) { - return error; - } - - - /* And now, fix everything up so the normal processing - * does the right thing. */ - *source += max((signed) (orig_l - l), 0); - *sourcelen -= max((signed) (orig_l - l), 0); - - /* Failed to resolve an incomplete character and - * ran out of buffer space. No recovery strategy - * possible, so explode everywhere. */ - if ((orig_l + ol) - l == 0) - abort(); - - /* Handle memry exhaustion case from above */ - if (error != HUBBUB_OK) - return error; - } - - while (*sourcelen > 0) { - error = hubbub_iconv_codec_read_char(c, - source, sourcelen, dest, destlen); - if (error != HUBBUB_OK) { - return error; - } - } - - return HUBBUB_OK; -} - -/** - * Clear an iconv-based codec's encoding state - * - * \param codec The codec to reset - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_iconv_codec_reset(hubbub_charsetcodec *codec) -{ - hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec; - - iconv(c->read_cd, NULL, NULL, NULL, NULL); - iconv(c->write_cd, NULL, NULL, NULL, NULL); - - c->inval_buf[0] = '\0'; - c->inval_len = 0; - - c->read_buf[0] = 0; - c->read_len = 0; - - c->write_buf[0] = 0; - c->write_len = 0; - - return HUBBUB_OK; -} - -/** - * Feed a UCS4 character through the registered filter and output the result - * - * \param c Codec to use - * \param ucs4 UCS4 character (big endian) - * \param dest Pointer to pointer to output buffer - * \param destlen Pointer to output buffer length - * \return HUBBUB_OK on success, - * HUBBUB_NOMEM if output buffer is too small, - * as a result of the failure of the - * client-provided filter function. - */ -hubbub_error hubbub_iconv_codec_filter_decoded_char(hubbub_iconv_codec *c, - uint32_t ucs4, uint8_t **dest, size_t *destlen) -{ - if (c->base.filter != NULL) { - uint32_t *rep; - size_t replen; - hubbub_error error; - - error = c->base.filter(ntohl(ucs4), &rep, &replen, - c->base.filter_pw); - if (error != HUBBUB_OK) { - return error; - } - - while (replen > 0 && *destlen >= replen * 4) { - *((uint32_t *) (void *) *dest) = htonl(*rep); - - *dest += 4; - *destlen -= 4; - - rep++; - replen--; - } - - if (*destlen < replen * 4) { - /* Run out of output buffer */ - size_t i; - - /* Buffer remaining output */ - c->read_len = replen; - - for (i = 0; i < replen; i++) { - c->read_buf[i] = htonl(rep[i]); - } - - return HUBBUB_NOMEM; - } - - } else { - if (*destlen < 4) { - /* Run out of output buffer */ - - c->read_len = 1; - c->read_buf[0] = ucs4; - - return HUBBUB_NOMEM; - } - - *((uint32_t *) (void *) *dest) = ucs4; - *dest += 4; - *destlen -= 4; - } - - return HUBBUB_OK; -} - -/** - * Detect if a codec's charset is Unicode capable - * - * \param c Codec to consider - * \return true if a Unicode variant, false otherwise - */ -bool hubbub_iconv_codec_is_unicode(hubbub_iconv_codec *c) -{ - static uint16_t ucs4; - static uint16_t ucs2; - static uint16_t utf8; - static uint16_t utf16; - static uint16_t utf16be; - static uint16_t utf16le; - static uint16_t utf32; - static uint16_t utf32be; - static uint16_t utf32le; - - if (ucs4 == 0) { - ucs4 = hubbub_mibenum_from_name("UCS-4", SLEN("UCS-4")); - ucs2 = hubbub_mibenum_from_name("UCS-2", SLEN("UCS-2")); - utf8 = hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")); - utf16 = hubbub_mibenum_from_name("UTF-16", SLEN("UTF-16")); - utf16be = hubbub_mibenum_from_name("UTF-16BE", - SLEN("UTF-16BE")); - utf16le = hubbub_mibenum_from_name("UTF-16LE", - SLEN("UTF-16LE")); - utf32 = hubbub_mibenum_from_name("UTF-32", SLEN("UTF-32")); - utf32be = hubbub_mibenum_from_name("UTF-32BE", - SLEN("UTF-32BE")); - utf32le = hubbub_mibenum_from_name("UTF-32LE", - SLEN("UTF-32LE")); - } - - return (c->base.mibenum == ucs4 || - c->base.mibenum == ucs2 || - c->base.mibenum == utf8 || - c->base.mibenum == utf16 || - c->base.mibenum == utf16be || - c->base.mibenum == utf16le || - c->base.mibenum == utf32 || - c->base.mibenum == utf32be || - c->base.mibenum == utf32le); -} - -/** - * Read a character from the codec's native charset to UCS4 (big endian) - * - * \param c The codec - * \param source Pointer to pointer to source buffer (updated on exit) - * \param sourcelen Pointer to length of source buffer (updated on exit) - * \param dest Pointer to pointer to output buffer (updated on exit) - * \param destlen Pointer to length of output buffer (updated on exit) - * \return HUBBUB_OK on success, - * HUBBUB_NOMEM if output buffer is too small, - * HUBBUB_INVALID if a character cannot be represented and the - * codec's error handling mode is set to STRICT, - * as a result of the failure of the - * client-provided filter function. - * - * On exit, ::source will point immediately _after_ the last input character - * read, if the result is _OK or _NOMEM. Any remaining output for the - * character will be buffered by the codec for writing on the next call. - * This buffered data is post-filtering, so will not be refiltered on the - * next call. - * - * In the case of the result being _INVALID or the filter function failing, - * ::source will point _at_ the last input character read; nothing will be - * written or buffered for the failed character. It is up to the client to - * fix the cause of the failure and retry the decoding process. - * - * ::sourcelen will be reduced appropriately on exit. - * - * ::dest will point immediately _after_ the last character written. - * - * ::destlen will be reduced appropriately on exit. - */ -hubbub_error hubbub_iconv_codec_read_char(hubbub_iconv_codec *c, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen) -{ - size_t iconv_ret; - const uint8_t *origsrc = *source; - size_t origsrclen = *sourcelen; - uint32_t ucs4; - uint8_t *pucs4 = (uint8_t *) &ucs4; - size_t sucs4 = 4; - hubbub_error error; - - /* Use iconv to convert a single character - * Side effect: Updates *source to point at next input - * character and *sourcelen to reflect reduced input length - */ - iconv_ret = iconv(c->read_cd, (char **) source, sourcelen, - (char **) (void *) &pucs4, &sucs4); - - if (iconv_ret != (size_t) -1 || - (*source != origsrc && sucs4 == 0)) { - /* Read a character */ - error = hubbub_iconv_codec_filter_decoded_char(c, - ucs4, dest, destlen); - if (error != HUBBUB_OK && error != HUBBUB_NOMEM) { - /* filter function failed; restore source pointers */ - *source = origsrc; - *sourcelen = origsrclen; - } - - /* Clear inval buffer */ - c->inval_buf[0] = '\0'; - c->inval_len = 0; - - return error; - } else if (errno == E2BIG) { - /* Should never happen */ - abort(); - } else if (errno == EINVAL) { - /* Incomplete input sequence */ - if (*sourcelen > INVAL_BUFSIZE) - abort(); - - memmove(c->inval_buf, (const char *) *source, *sourcelen); - c->inval_buf[*sourcelen] = '\0'; - c->inval_len = *sourcelen; - - *source += *sourcelen; - *sourcelen = 0; - - return HUBBUB_OK; - } else if (errno == EILSEQ) { - /* Illegal input sequence */ - bool found = false; - const uint8_t *oldsrc; - size_t oldsrclen; - - /* Clear inval buffer */ - c->inval_buf[0] = '\0'; - c->inval_len = 0; - - /* Strict errormode; simply flag invalid character */ - if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT) { - /* restore source pointers */ - *source = origsrc; - *sourcelen = origsrclen; - - return HUBBUB_INVALID; - } - - /* Ok, this becomes problematic. The iconv API here - * is particularly unhelpful; *source will point at - * the _start_ of the illegal sequence. This means - * that we must find the end of the sequence */ - - /* Search for the start of the next valid input - * sequence (or the end of the input stream) */ - while (*sourcelen > 1) { - pucs4 = (uint8_t *) &ucs4; - sucs4 = 4; - - (*source)++; - (*sourcelen)--; - - oldsrc = *source; - oldsrclen = *sourcelen; - - iconv_ret = iconv(c->read_cd, - (char **) source, sourcelen, - (char **) (void *) &pucs4, &sucs4); - if (iconv_ret != (size_t) -1 || errno != EILSEQ) { - found = true; - break; - } - } - - if (found) { - /* Found start of next valid sequence */ - *source = oldsrc; - *sourcelen = oldsrclen; - } else { - /* Not found - skip last byte in buffer */ - (*source)++; - (*sourcelen)--; - - if (*sourcelen != 0) - abort(); - } - - /* output U+FFFD and continue processing. */ - error = hubbub_iconv_codec_filter_decoded_char(c, - htonl(0xFFFD), dest, destlen); - if (error != HUBBUB_OK && error != HUBBUB_NOMEM) { - /* filter function failed; restore source pointers */ - *source = origsrc; - *sourcelen = origsrclen; - } - - return error; - } - - return HUBBUB_OK; -} - -/** - * Write a UCS4 character in a codec's native charset - * - * \param c The codec - * \param ucs4 The UCS4 character to write (big endian) - * \param dest Pointer to pointer to output buffer (updated on exit) - * \param destlen Pointer to length of output buffer (updated on exit) - * \return HUBBUB_OK on success, - * HUBBUB_NOMEM if output buffer is too small, - * HUBBUB_INVALID if character cannot be represented and the - * codec's error handling mode is set to STRICT. - */ -hubbub_error hubbub_iconv_codec_write_char(hubbub_iconv_codec *c, - uint32_t ucs4, uint8_t **dest, size_t *destlen) -{ - size_t iconv_ret; - uint8_t *pucs4 = (uint8_t *) &ucs4; - size_t sucs4 = 4; - uint8_t *origdest = *dest; - - iconv_ret = iconv(c->write_cd, (char **) (void *) &pucs4, - &sucs4, (char **) dest, destlen); - - if (iconv_ret == (size_t) -1 && errno == E2BIG) { - /* Output buffer is too small */ - return HUBBUB_NOMEM; - } else if (iconv_ret == (size_t) -1 && errno == EILSEQ) { - /* Illegal multibyte sequence */ - /* This should never happen */ - abort(); - } else if (iconv_ret == (size_t) -1 && errno == EINVAL) { - /* Incomplete input character */ - /* This should never happen */ - abort(); - } else if (*dest == origdest) { - /* Nothing was output */ - switch (c->base.errormode) { - case HUBBUB_CHARSETCODEC_ERROR_STRICT: - return HUBBUB_INVALID; - - case HUBBUB_CHARSETCODEC_ERROR_TRANSLIT: - /** \todo transliteration */ - case HUBBUB_CHARSETCODEC_ERROR_LOOSE: - { - pucs4 = (uint8_t *) &ucs4; - sucs4 = 4; - - ucs4 = hubbub_iconv_codec_is_unicode(c) - ? htonl(0xFFFD) : htonl(0x3F); - - iconv_ret = iconv(c->write_cd, - (char **) (void *) &pucs4, &sucs4, - (char **) dest, destlen); - - if (iconv_ret == (size_t) -1 && errno == E2BIG) { - return HUBBUB_NOMEM; - } else if (iconv_ret == (size_t) -1 && - errno == EILSEQ) { - /* Illegal multibyte sequence */ - /* This should never happen */ - abort(); - } else if (iconv_ret == (size_t) -1 && - errno == EINVAL) { - /* Incomplete input character */ - /* This should never happen */ - abort(); - } - } - break; - } - } - - return HUBBUB_OK; -} - -const hubbub_charsethandler hubbub_iconv_codec_handler = { - hubbub_iconv_codec_handles_charset, - hubbub_iconv_codec_create -}; Index: src/charset/Makefile =================================================================== --- src/charset/Makefile (revision 4665) +++ src/charset/Makefile (working copy) @@ -32,7 +32,7 @@ d := $(DIR) # Sources -SRCS_$(d) := aliases.c codec.c codec_iconv.c codec_utf8.c codec_utf16.c detect.c +SRCS_$(d) := detect.c # Append to sources for component SOURCES += $(addprefix $(d), $(SRCS_$(d))) Index: src/charset/codec.c =================================================================== --- src/charset/codec.c (revision 4665) +++ src/charset/codec.c (working copy) @@ -1,188 +0,0 @@ -/* - * This file is part of Hubbub. - * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell - */ - -#include - -#include "charset/aliases.h" - -#include "codec_impl.h" - -extern hubbub_charsethandler hubbub_iconv_codec_handler; -extern hubbub_charsethandler hubbub_utf8_codec_handler; -extern hubbub_charsethandler hubbub_utf16_codec_handler; - -static hubbub_charsethandler *handler_table[] = { - &hubbub_utf8_codec_handler, - &hubbub_utf16_codec_handler, - &hubbub_iconv_codec_handler, - NULL, -}; - -/** - * Create a charset codec - * - * \param charset Target charset - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data (may be NULL) - * \return Pointer to codec instance, or NULL on failure - */ -hubbub_charsetcodec *hubbub_charsetcodec_create(const char *charset, - hubbub_alloc alloc, void *pw) -{ - hubbub_charsetcodec *codec; - hubbub_charsethandler **handler; - const hubbub_aliases_canon * canon; - - if (charset == NULL || alloc == NULL) - return NULL; - - /* Canonicalise charset name. */ - canon = hubbub_alias_canonicalise(charset, strlen(charset)); - if (canon == NULL) - return NULL; - - /* Search for handler class */ - for (handler = handler_table; *handler != NULL; handler++) { - if ((*handler)->handles_charset(canon->name)) - break; - } - - /* None found */ - if ((*handler) == NULL) - return NULL; - - /* Instantiate class */ - codec = (*handler)->create(canon->name, alloc, pw); - if (codec == NULL) - return NULL; - - /* and initialise it */ - codec->mibenum = canon->mib_enum; - - codec->filter = NULL; - codec->filter_pw = NULL; - - codec->errormode = HUBBUB_CHARSETCODEC_ERROR_LOOSE; - - codec->alloc = alloc; - codec->alloc_pw = pw; - - return codec; -} - -/** - * Destroy a charset codec - * - * \param codec The codec to destroy - */ -void hubbub_charsetcodec_destroy(hubbub_charsetcodec *codec) -{ - if (codec == NULL) - return; - - codec->handler.destroy(codec); - - codec->alloc(codec, 0, codec->alloc_pw); -} - -/** - * Configure a charset codec - * - * \param codec The codec to configure - * \parem type The codec option type to configure - * \param params Option-specific parameters - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_charsetcodec_setopt(hubbub_charsetcodec *codec, - hubbub_charsetcodec_opttype type, - hubbub_charsetcodec_optparams *params) -{ - if (codec == NULL || params == NULL) - return HUBBUB_BADPARM; - - switch (type) { - case HUBBUB_CHARSETCODEC_FILTER_FUNC: - codec->filter = params->filter_func.filter; - codec->filter_pw = params->filter_func.pw; - break; - - case HUBBUB_CHARSETCODEC_ERROR_MODE: - codec->errormode = params->error_mode.mode; - break; - } - - return HUBBUB_OK; -} - -/** - * Encode a chunk of UCS4 data into a codec's charset - * - * \param codec The codec to use - * \param source Pointer to pointer to source data - * \param sourcelen Pointer to length (in bytes) of source data - * \param dest Pointer to pointer to output buffer - * \param destlen Pointer to length (in bytes) of output buffer - * \return HUBBUB_OK on success, appropriate error otherwise. - * - * source, sourcelen, dest and destlen will be updated appropriately on exit - */ -hubbub_error hubbub_charsetcodec_encode(hubbub_charsetcodec *codec, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen) -{ - if (codec == NULL || source == NULL || *source == NULL || - sourcelen == NULL || dest == NULL || *dest == NULL || - destlen == NULL) - return HUBBUB_BADPARM; - - return codec->handler.encode(codec, source, sourcelen, dest, destlen); -} - -/** - * Decode a chunk of data in a codec's charset into UCS4 - * - * \param codec The codec to use - * \param source Pointer to pointer to source data - * \param sourcelen Pointer to length (in bytes) of source data - * \param dest Pointer to pointer to output buffer - * \param destlen Pointer to length (in bytes) of output buffer - * \return HUBBUB_OK on success, appropriate error otherwise. - * - * source, sourcelen, dest and destlen will be updated appropriately on exit - * - * Call this with a source length of 0 to flush any buffers. - */ -hubbub_error hubbub_charsetcodec_decode(hubbub_charsetcodec *codec, - const uint8_t **source, size_t *sourcelen, - uint8_t **dest, size_t *destlen) -{ - if (codec == NULL || source == NULL || *source == NULL || - sourcelen == NULL || dest == NULL || *dest == NULL || - destlen == NULL) - return HUBBUB_BADPARM; - - return codec->handler.decode(codec, source, sourcelen, dest, destlen); -} - -/** - * Clear a charset codec's encoding state - * - * \param codec The codec to reset - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_charsetcodec_reset(hubbub_charsetcodec *codec) -{ - if (codec == NULL) - return HUBBUB_BADPARM; - - /* Reset filter */ - if (codec->filter) - codec->filter(HUBBUB_CHARSETCODEC_NULL, NULL, NULL, NULL); - - return codec->handler.reset(codec); -} - Index: src/charset/detect.h =================================================================== --- src/charset/detect.h (revision 4665) +++ src/charset/detect.h (working copy) @@ -10,13 +10,11 @@ #include -#include -#include -#include +#include /* Extract a charset from a chunk of data */ -hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len, - uint16_t *mibenum, hubbub_charset_source *source); +parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len, + uint16_t *mibenum, uint32_t *source); #endif Index: src/parser.c =================================================================== --- src/parser.c (revision 4665) +++ src/parser.c (working copy) @@ -9,6 +9,7 @@ #include +#include "charset/detect.h" #include "tokeniser/tokeniser.h" #include "treebuilder/treebuilder.h" @@ -45,8 +46,9 @@ if (parser == NULL) return NULL; - parser->stream = parserutils_inputstream_create(enc, 0, NULL, - alloc, pw); + parser->stream = parserutils_inputstream_create(enc, + enc != NULL ? HUBBUB_CHARSET_DICTATED : HUBBUB_CHARSET_UNKNOWN, + hubbub_charset_extract, alloc, pw); if (parser->stream == NULL) { alloc(parser, 0, pw); return NULL; @@ -263,26 +265,3 @@ return parserutils_inputstream_read_charset(parser->stream, source); } -#if 0 -/** - * Claim ownership of the document buffer - * - * \param parser Parser whose buffer to claim - * \param buffer Pointer to location to receive buffer pointer - * \param len Pointer to location to receive byte length of buffer - * \return HUBBUB_OK on success, appropriate error otherwise. - * - * Once the buffer has been claimed by a client, the parser disclaims - * all ownership rights (and invalidates any internal references it may have - * to the buffer). Therefore, the only parser call which may be made - * after calling this function is to destroy the parser. - */ -hubbub_error hubbub_parser_claim_buffer(hubbub_parser *parser, - uint8_t **buffer, size_t *len) -{ - if (parser == NULL || buffer == NULL || len == NULL) - return HUBBUB_BADPARM; - - return hubbub_inputstream_claim_buffer(parser->stream, buffer, len); -} -#endif