libdatrie/datrie/alpha-map.c

597 lines
15 KiB
C

/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* libdatrie - Double-Array Trie Library
* Copyright (C) 2006 Theppitak Karoonboonyanan <theppitak@gmail.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* alpha-map.c - map between character codes and trie alphabet
* Created: 2006-08-19
* Author: Theppitak Karoonboonyanan <theppitak@gmail.com>
*/
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include "alpha-map.h"
#include "alpha-map-private.h"
#include "trie-private.h"
#include "fileutils.h"
#include "trie-string.h"
/**
* @brief Alphabet string length
*
* @param str : the array of null-terminated AlphaChar string to measure
*
* @return the total characters in @a str.
*/
int
alpha_char_strlen (const AlphaChar *str)
{
const AlphaChar *p;
for (p = str; *p; p++)
;
return p - str;
}
/**
* @brief Compare alphabet strings
*
* @param str1, str2 : the arrays of null-terminated AlphaChar strings
* to compare
*
* @return negative if @a str1 < @a str2;
* 0 if @a str1 == @a str2;
* positive if @a str1 > @a str2
*
* Available since: 0.2.7
*/
int
alpha_char_strcmp (const AlphaChar *str1, const AlphaChar *str2)
{
while (*str1 && *str1 == *str2) {
str1++; str2++;
}
if (*str1 < *str2)
return -1;
if (*str1 > *str2)
return 1;
return 0;
}
/*------------------------------*
* PRIVATE DATA DEFINITONS *
*------------------------------*/
typedef struct _AlphaRange {
struct _AlphaRange *next;
AlphaChar begin;
AlphaChar end;
} AlphaRange;
struct _AlphaMap {
AlphaRange *first_range;
/* work area */
/* alpha-to-trie map */
AlphaChar alpha_begin;
AlphaChar alpha_end;
int alpha_map_sz;
TrieIndex *alpha_to_trie_map;
/* trie-to-alpha map */
int trie_map_sz;
AlphaChar *trie_to_alpha_map;
};
/*-----------------------------------*
* PRIVATE METHODS DECLARATIONS *
*-----------------------------------*/
static int alpha_map_get_total_ranges (const AlphaMap *alpha_map);
static int alpha_map_add_range_only (AlphaMap *alpha_map,
AlphaChar begin, AlphaChar end);
static int alpha_map_recalc_work_area (AlphaMap *alpha_map);
/*-----------------------------*
* METHODS IMPLEMENTAIONS *
*-----------------------------*/
#define ALPHAMAP_SIGNATURE 0xD9FCD9FC
/* AlphaMap Header:
* - INT32: signature
* - INT32: total ranges
*
* Ranges:
* - INT32: range begin
* - INT32: range end
*/
/**
* @brief Create new alphabet map
*
* @return a pointer to the newly created alphabet map, NULL on failure
*
* Create a new empty alphabet map. The map contents can then be added with
* alpha_map_add_range().
*
* The created object must be freed with alpha_map_free().
*/
AlphaMap *
alpha_map_new (void)
{
AlphaMap *alpha_map;
alpha_map = (AlphaMap *) malloc (sizeof (AlphaMap));
if (UNLIKELY (!alpha_map))
return NULL;
alpha_map->first_range = NULL;
/* work area */
alpha_map->alpha_begin = 0;
alpha_map->alpha_end = 0;
alpha_map->alpha_map_sz = 0;
alpha_map->alpha_to_trie_map = NULL;
alpha_map->trie_map_sz = 0;
alpha_map->trie_to_alpha_map = NULL;
return alpha_map;
}
/**
* @brief Create a clone of alphabet map
*
* @param a_map : the source alphabet map to clone
*
* @return a pointer to the alphabet map clone, NULL on failure
*
* The created object must be freed with alpha_map_free().
*/
AlphaMap *
alpha_map_clone (const AlphaMap *a_map)
{
AlphaMap *alpha_map;
AlphaRange *range;
alpha_map = alpha_map_new ();
if (UNLIKELY (!alpha_map))
return NULL;
for (range = a_map->first_range; range; range = range->next) {
if (alpha_map_add_range_only (alpha_map, range->begin, range->end) != 0)
goto exit_map_created;
}
if (alpha_map_recalc_work_area (alpha_map) != 0)
goto exit_map_created;
return alpha_map;
exit_map_created:
alpha_map_free (alpha_map);
return NULL;
}
/**
* @brief Free an alphabet map object
*
* @param alpha_map : the alphabet map object to free
*
* Destruct the @a alpha_map and free its allocated memory.
*/
void
alpha_map_free (AlphaMap *alpha_map)
{
AlphaRange *p, *q;
p = alpha_map->first_range;
while (p) {
q = p->next;
free (p);
p = q;
}
/* work area */
if (alpha_map->alpha_to_trie_map)
free (alpha_map->alpha_to_trie_map);
if (alpha_map->trie_to_alpha_map)
free (alpha_map->trie_to_alpha_map);
free (alpha_map);
}
AlphaMap *
alpha_map_fread_bin (FILE *file)
{
long save_pos;
uint32 sig;
int32 total, i;
AlphaMap *alpha_map;
/* check signature */
save_pos = ftell (file);
if (!file_read_int32 (file, (int32 *) &sig) || ALPHAMAP_SIGNATURE != sig)
goto exit_file_read;
alpha_map = alpha_map_new ();
if (UNLIKELY (!alpha_map))
goto exit_file_read;
/* read number of ranges */
if (!file_read_int32 (file, &total))
goto exit_map_created;
/* read character ranges */
for (i = 0; i < total; i++) {
int32 b, e;
if (!file_read_int32 (file, &b) || !file_read_int32 (file, &e))
goto exit_map_created;
alpha_map_add_range_only (alpha_map, b, e);
}
/* work area */
if (UNLIKELY (alpha_map_recalc_work_area (alpha_map) != 0))
goto exit_map_created;
return alpha_map;
exit_map_created:
alpha_map_free (alpha_map);
exit_file_read:
fseek (file, save_pos, SEEK_SET);
return NULL;
}
static int
alpha_map_get_total_ranges (const AlphaMap *alpha_map)
{
int n;
AlphaRange *range;
for (n = 0, range = alpha_map->first_range; range; range = range->next) {
++n;
}
return n;
}
int
alpha_map_fwrite_bin (const AlphaMap *alpha_map, FILE *file)
{
AlphaRange *range;
if (!file_write_int32 (file, ALPHAMAP_SIGNATURE) ||
!file_write_int32 (file, alpha_map_get_total_ranges (alpha_map)))
{
return -1;
}
for (range = alpha_map->first_range; range; range = range->next) {
if (!file_write_int32 (file, range->begin) ||
!file_write_int32 (file, range->end))
{
return -1;
}
}
return 0;
}
size_t
alpha_map_get_serialized_size (const AlphaMap *alpha_map)
{
int32 ranges_count = alpha_map_get_total_ranges (alpha_map);
return (
4 /* ALPHAMAP_SIGNATURE */
+ sizeof (ranges_count)
+ (sizeof (AlphaChar) * 2) * ranges_count /* range->begin, range->end */
);
}
void
alpha_map_serialize_bin (const AlphaMap *alpha_map, uint8 **ptr)
{
AlphaRange *range;
serialize_int32_be_incr (ptr, ALPHAMAP_SIGNATURE);
serialize_int32_be_incr (ptr, alpha_map_get_total_ranges (alpha_map));
for (range = alpha_map->first_range; range; range = range->next) {
serialize_int32_be_incr (ptr, range->begin);
serialize_int32_be_incr (ptr, range->end);
}
}
static int
alpha_map_add_range_only (AlphaMap *alpha_map, AlphaChar begin, AlphaChar end)
{
AlphaRange *q, *r, *begin_node, *end_node;
if (begin > end)
return -1;
begin_node = end_node = 0;
/* Skip first ranges till 'begin' is covered */
for (q = 0, r = alpha_map->first_range;
r && r->begin <= begin;
q = r, r = r->next)
{
if (begin <= r->end) {
/* 'r' covers 'begin' -> take 'r' as beginning point */
begin_node = r;
break;
}
if (r->end + 1 == begin) {
/* 'begin' is next to 'r'-end
* -> extend 'r'-end to cover 'begin'
*/
r->end = begin;
begin_node = r;
break;
}
}
if (!begin_node && r && r->begin <= end + 1) {
/* ['begin', 'end'] overlaps into 'r'-begin
* or 'r' is next to 'end' if r->begin == end + 1
* -> extend 'r'-begin to include the range
*/
r->begin = begin;
begin_node = r;
}
/* Run upto the first range that exceeds 'end' */
while (r && r->begin <= end + 1) {
if (end <= r->end) {
/* 'r' covers 'end' -> take 'r' as ending point */
end_node = r;
} else if (r != begin_node) {
/* ['begin', 'end'] covers the whole 'r' -> remove 'r' */
if (q) {
q->next = r->next;
free (r);
r = q->next;
} else {
alpha_map->first_range = r->next;
free (r);
r = alpha_map->first_range;
}
continue;
}
q = r;
r = r->next;
}
if (!end_node && q && begin <= q->end) {
/* ['begin', 'end'] overlaps 'q' at the end
* -> extend 'q'-end to include the range
*/
q->end = end;
end_node = q;
}
if (begin_node && end_node) {
if (begin_node != end_node) {
/* Merge begin_node and end_node ranges together */
assert (begin_node->next == end_node);
begin_node->end = end_node->end;
begin_node->next = end_node->next;
free (end_node);
}
} else if (!begin_node && !end_node) {
/* ['begin', 'end'] overlaps with none of the ranges
* -> insert a new range
*/
AlphaRange *range = (AlphaRange *) malloc (sizeof (AlphaRange));
if (UNLIKELY (!range))
return -1;
range->begin = begin;
range->end = end;
/* insert it between 'q' and 'r' */
if (q) {
q->next = range;
} else {
alpha_map->first_range = range;
}
range->next = r;
}
return 0;
}
static int
alpha_map_recalc_work_area (AlphaMap *alpha_map)
{
AlphaRange *range;
/* free old existing map */
if (alpha_map->alpha_to_trie_map) {
free (alpha_map->alpha_to_trie_map);
alpha_map->alpha_to_trie_map = NULL;
}
if (alpha_map->trie_to_alpha_map) {
free (alpha_map->trie_to_alpha_map);
alpha_map->trie_to_alpha_map = NULL;
}
range = alpha_map->first_range;
if (range) {
const AlphaChar alpha_begin = range->begin;
int n_alpha, n_trie, i;
AlphaChar a;
TrieIndex trie_char;
alpha_map->alpha_begin = alpha_begin;
n_trie = 0;
for ( ;; ) {
n_trie += range->end - range->begin + 1;
if (!range->next)
break;
range = range->next;
}
if (n_trie < TRIE_CHAR_TERM) {
n_trie = TRIE_CHAR_TERM + 1;
} else {
n_trie++;
}
alpha_map->alpha_end = range->end;
alpha_map->alpha_map_sz = n_alpha = range->end - alpha_begin + 1;
alpha_map->alpha_to_trie_map
= (TrieIndex *) malloc (n_alpha * sizeof (TrieIndex));
if (UNLIKELY (!alpha_map->alpha_to_trie_map))
goto error_alpha_map_not_created;
for (i = 0; i < n_alpha; i++) {
alpha_map->alpha_to_trie_map[i] = TRIE_INDEX_MAX;
}
alpha_map->trie_map_sz = n_trie;
alpha_map->trie_to_alpha_map
= (AlphaChar *) malloc (n_trie * sizeof (AlphaChar));
if (UNLIKELY (!alpha_map->trie_to_alpha_map))
goto error_alpha_map_created;
trie_char = 0;
for (range = alpha_map->first_range; range; range = range->next) {
for (a = range->begin; a <= range->end; a++) {
if (TRIE_CHAR_TERM == trie_char)
trie_char++;
alpha_map->alpha_to_trie_map[a - alpha_begin] = trie_char;
alpha_map->trie_to_alpha_map[trie_char] = a;
trie_char++;
}
}
while (trie_char < n_trie) {
alpha_map->trie_to_alpha_map[trie_char++] = ALPHA_CHAR_ERROR;
}
alpha_map->trie_to_alpha_map[TRIE_CHAR_TERM] = 0;
}
return 0;
error_alpha_map_created:
free (alpha_map->alpha_to_trie_map);
alpha_map->alpha_to_trie_map = NULL;
error_alpha_map_not_created:
return -1;
}
/**
* @brief Add a range to alphabet map
*
* @param alpha_map : the alphabet map object
* @param begin : the first character of the range
* @param end : the last character of the range
*
* @return 0 on success, non-zero on failure
*
* Add a range of character codes from @a begin to @a end to the
* alphabet set.
*/
int
alpha_map_add_range (AlphaMap *alpha_map, AlphaChar begin, AlphaChar end)
{
int res = alpha_map_add_range_only (alpha_map, begin, end);
if (res != 0)
return res;
return alpha_map_recalc_work_area (alpha_map);
}
TrieIndex
alpha_map_char_to_trie (const AlphaMap *alpha_map, AlphaChar ac)
{
TrieIndex alpha_begin;
if (UNLIKELY (0 == ac))
return TRIE_CHAR_TERM;
if (UNLIKELY (!alpha_map->alpha_to_trie_map))
return TRIE_INDEX_MAX;
alpha_begin = alpha_map->alpha_begin;
if (alpha_begin <= ac && ac <= alpha_map->alpha_end)
{
return alpha_map->alpha_to_trie_map[ac - alpha_begin];
}
return TRIE_INDEX_MAX;
}
AlphaChar
alpha_map_trie_to_char (const AlphaMap *alpha_map, TrieChar tc)
{
if (tc < alpha_map->trie_map_sz)
return alpha_map->trie_to_alpha_map[tc];
return ALPHA_CHAR_ERROR;
}
TrieChar *
alpha_map_char_to_trie_str (const AlphaMap *alpha_map, const AlphaChar *str)
{
TrieChar *trie_str, *p;
trie_str = (TrieChar *) malloc (alpha_char_strlen (str) + 1);
if (UNLIKELY (!trie_str))
return NULL;
for (p = trie_str; *str; p++, str++) {
TrieIndex tc = alpha_map_char_to_trie (alpha_map, *str);
if (TRIE_INDEX_MAX == tc)
goto error_str_allocated;
*p = (TrieChar) tc;
}
*p = TRIE_CHAR_TERM;
return trie_str;
error_str_allocated:
free (trie_str);
return NULL;
}
AlphaChar *
alpha_map_trie_to_char_str (const AlphaMap *alpha_map, const TrieChar *str)
{
AlphaChar *alpha_str, *p;
alpha_str = (AlphaChar *) malloc ((trie_char_strlen (str) + 1)
* sizeof (AlphaChar));
if (UNLIKELY (!alpha_str))
return NULL;
for (p = alpha_str; *str; p++, str++) {
*p = (AlphaChar) alpha_map_trie_to_char (alpha_map, *str);
}
*p = 0;
return alpha_str;
}
/*
vi:ts=4:ai:expandtab
*/