libdatrie/datrie/tail.c

580 lines
14 KiB
C
Raw Normal View History

2022-05-14 00:40:53 +08:00
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* libdatrie - Double-Array Trie Library
* Copyright (C) 2006 Theppitak Karoonboonyanan <theppitak@gmail.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* tail.c - trie tail for keeping suffixes
* Created: 2006-08-15
* Author: Theppitak Karoonboonyanan <theppitak@gmail.com>
*/
#include <string.h>
#include <stdlib.h>
#ifndef _MSC_VER /* for SIZE_MAX */
# include <stdint.h>
#endif
#include <stdio.h>
#include "tail.h"
#include "trie-private.h"
#include "fileutils.h"
2024-04-24 14:24:51 +08:00
#include "trie-string.h"
2022-05-14 00:40:53 +08:00
/*----------------------------------*
* INTERNAL TYPES DECLARATIONS *
*----------------------------------*/
/*-----------------------------------*
* PRIVATE METHODS DECLARATIONS *
*-----------------------------------*/
static TrieIndex tail_alloc_block (Tail *t);
static void tail_free_block (Tail *t, TrieIndex block);
/* ==================== BEGIN IMPLEMENTATION PART ==================== */
/*------------------------------------*
* INTERNAL TYPES IMPLEMENTATIONS *
*------------------------------------*/
/*------------------------------*
* PRIVATE DATA DEFINITONS *
*------------------------------*/
typedef struct {
TrieIndex next_free;
TrieData data;
TrieChar *suffix;
} TailBlock;
struct _Tail {
TrieIndex num_tails;
TailBlock *tails;
TrieIndex first_free;
};
/*-----------------------------*
* METHODS IMPLEMENTAIONS *
*-----------------------------*/
#define TAIL_SIGNATURE 0xDFFCDFFC
#define TAIL_START_BLOCKNO 1
/* Tail Header:
* INT32: signature
* INT32: pointer to first free slot
* INT32: number of tail blocks
*
* Tail Blocks:
* INT32: pointer to next free block (-1 for allocated blocks)
* INT32: data for the key
* INT16: length
* BYTES[length]: suffix string (no terminating '\0')
*/
/**
* @brief Create a new tail object
*
* Create a new empty tail object.
*/
Tail *
tail_new (void)
{
Tail *t;
t = (Tail *) malloc (sizeof (Tail));
if (UNLIKELY (!t))
return NULL;
t->first_free = 0;
t->num_tails = 0;
t->tails = NULL;
return t;
}
/**
* @brief Read tail data from file
*
* @param file : the file to read
*
* @return a pointer to the openned tail data, NULL on failure
*
* Read tail data from the opened file, starting from the current
* file pointer until the end of tail data block. On return, the
* file pointer is left at the position after the read block.
*/
Tail *
tail_fread (FILE *file)
{
long save_pos;
Tail *t;
TrieIndex i;
uint32 sig;
/* check signature */
save_pos = ftell (file);
if (!file_read_int32 (file, (int32 *) &sig) || TAIL_SIGNATURE != sig)
goto exit_file_read;
t = (Tail *) malloc (sizeof (Tail));
if (UNLIKELY (!t))
goto exit_file_read;
if (!file_read_int32 (file, &t->first_free) ||
!file_read_int32 (file, &t->num_tails))
{
goto exit_tail_created;
}
if (t->num_tails > SIZE_MAX / sizeof (TailBlock))
goto exit_tail_created;
t->tails = (TailBlock *) malloc (t->num_tails * sizeof (TailBlock));
if (UNLIKELY (!t->tails))
goto exit_tail_created;
for (i = 0; i < t->num_tails; i++) {
int16 length;
if (!file_read_int32 (file, &t->tails[i].next_free) ||
!file_read_int32 (file, &t->tails[i].data) ||
!file_read_int16 (file, &length))
{
goto exit_in_loop;
}
t->tails[i].suffix = (TrieChar *) malloc (length + 1);
if (UNLIKELY (!t->tails[i].suffix))
goto exit_in_loop;
if (length > 0) {
if (!file_read_chars (file, (char *)t->tails[i].suffix, length)) {
free (t->tails[i].suffix);
goto exit_in_loop;
}
}
2024-04-24 14:24:51 +08:00
t->tails[i].suffix[length] = TRIE_CHAR_TERM;
2022-05-14 00:40:53 +08:00
}
return t;
exit_in_loop:
while (i > 0) {
free (t->tails[--i].suffix);
}
free (t->tails);
exit_tail_created:
free (t);
exit_file_read:
fseek (file, save_pos, SEEK_SET);
return NULL;
}
/**
* @brief Free tail data
*
* @param t : the tail data
*
* @return 0 on success, non-zero on failure
*
* Free the given tail data.
*/
void
tail_free (Tail *t)
{
TrieIndex i;
if (t->tails) {
for (i = 0; i < t->num_tails; i++)
if (t->tails[i].suffix)
free (t->tails[i].suffix);
free (t->tails);
}
free (t);
}
/**
* @brief Write tail data
*
* @param t : the tail data
* @param file : the file to write to
*
* @return 0 on success, non-zero on failure
*
* Write tail data to the given @a file, starting from the current file
* pointer. On return, the file pointer is left after the tail data block.
*/
int
tail_fwrite (const Tail *t, FILE *file)
{
TrieIndex i;
if (!file_write_int32 (file, TAIL_SIGNATURE) ||
!file_write_int32 (file, t->first_free) ||
!file_write_int32 (file, t->num_tails))
{
return -1;
}
for (i = 0; i < t->num_tails; i++) {
int16 length;
if (!file_write_int32 (file, t->tails[i].next_free) ||
!file_write_int32 (file, t->tails[i].data))
{
return -1;
}
2024-04-24 14:24:51 +08:00
length = t->tails[i].suffix ? trie_char_strlen (t->tails[i].suffix)
2022-05-14 00:40:53 +08:00
: 0;
if (!file_write_int16 (file, length))
return -1;
if (length > 0 &&
!file_write_chars (file, (char *)t->tails[i].suffix, length))
{
return -1;
}
}
return 0;
}
2024-04-24 14:24:51 +08:00
size_t
tail_get_serialized_size (const Tail *t)
{
size_t static_count = (
sizeof (int32) /* sizeof (TAIL_SIGNATURE) */
+ sizeof (t->first_free)
+ sizeof (t->num_tails)
);
size_t dynamic_count = 0u;
if (t->num_tails > 0) {
TrieIndex i = 0;
dynamic_count += (
sizeof (t->tails[i].next_free) + sizeof (t->tails[i].data)
+ sizeof (int16) /* length */
) * t->num_tails;
for (; i < t->num_tails; i++) {
if (t->tails[i].suffix)
{
dynamic_count += trie_char_strsize (t->tails[i].suffix);
}
}
}
return static_count + dynamic_count;
}
int
tail_serialize (const Tail *t, uint8 **ptr)
{
TrieIndex i;
serialize_int32_be_incr (ptr, TAIL_SIGNATURE);
serialize_int32_be_incr (ptr, t->first_free);
serialize_int32_be_incr (ptr, t->num_tails);
for (i = 0; i < t->num_tails; i++) {
int16 length;
serialize_int32_be_incr (ptr, t->tails[i].next_free);
serialize_int32_be_incr (ptr, t->tails[i].data);
length = t->tails[i].suffix ? trie_char_strsize (t->tails[i].suffix)
: 0;
serialize_int16_be_incr (ptr, length);
if (length)
{
memcpy (*ptr, (char *)t->tails[i].suffix, length);
*ptr += length;
}
}
return 0;
}
2022-05-14 00:40:53 +08:00
/**
* @brief Get suffix
*
* @param t : the tail data
* @param index : the index of the suffix
*
* @return pointer to the indexed suffix string.
*
* Get suffix from tail with given @a index. The returned string is a pointer
* to internal storage, which should be accessed read-only by the caller.
* No need to free() it.
*/
const TrieChar *
tail_get_suffix (const Tail *t, TrieIndex index)
{
index -= TAIL_START_BLOCKNO;
return LIKELY (index < t->num_tails) ? t->tails[index].suffix : NULL;
}
/**
* @brief Set suffix of existing entry
*
* @param t : the tail data
* @param index : the index of the suffix
* @param suffix : the new suffix
*
* Set suffix of existing entry of given @a index in tail.
*/
Bool
tail_set_suffix (Tail *t, TrieIndex index, const TrieChar *suffix)
{
index -= TAIL_START_BLOCKNO;
if (LIKELY (index < t->num_tails)) {
/* suffix and t->tails[index].suffix may overlap;
* so, dup it before it's overwritten
*/
TrieChar *tmp = NULL;
if (suffix) {
2024-04-24 14:24:51 +08:00
tmp = trie_char_strdup (suffix);
2022-05-14 00:40:53 +08:00
if (UNLIKELY (!tmp))
return FALSE;
}
if (t->tails[index].suffix)
free (t->tails[index].suffix);
t->tails[index].suffix = tmp;
return TRUE;
}
return FALSE;
}
/**
* @brief Add a new suffix
*
* @param t : the tail data
* @param suffix : the new suffix
*
* @return the index of the newly added suffix,
* or TRIE_INDEX_ERROR on failure.
*
* Add a new suffix entry to tail.
*/
TrieIndex
tail_add_suffix (Tail *t, const TrieChar *suffix)
{
TrieIndex new_block;
new_block = tail_alloc_block (t);
if (UNLIKELY (TRIE_INDEX_ERROR == new_block))
return TRIE_INDEX_ERROR;
tail_set_suffix (t, new_block, suffix);
return new_block;
}
static TrieIndex
tail_alloc_block (Tail *t)
{
TrieIndex block;
if (0 != t->first_free) {
block = t->first_free;
t->first_free = t->tails[block].next_free;
} else {
void *new_block;
block = t->num_tails;
new_block = realloc (t->tails, (t->num_tails + 1) * sizeof (TailBlock));
if (UNLIKELY (!new_block))
return TRIE_INDEX_ERROR;
t->tails = (TailBlock *) new_block;
++t->num_tails;
}
t->tails[block].next_free = -1;
t->tails[block].data = TRIE_DATA_ERROR;
t->tails[block].suffix = NULL;
return block + TAIL_START_BLOCKNO;
}
static void
tail_free_block (Tail *t, TrieIndex block)
{
TrieIndex i, j;
block -= TAIL_START_BLOCKNO;
if (block >= t->num_tails)
return;
t->tails[block].data = TRIE_DATA_ERROR;
if (NULL != t->tails[block].suffix) {
free (t->tails[block].suffix);
t->tails[block].suffix = NULL;
}
/* find insertion point */
j = 0;
for (i = t->first_free; i != 0 && i < block; i = t->tails[i].next_free)
j = i;
/* insert free block between j and i */
t->tails[block].next_free = i;
if (0 != j)
t->tails[j].next_free = block;
else
t->first_free = block;
}
/**
* @brief Get data associated to suffix entry
*
* @param t : the tail data
* @param index : the index of the suffix
*
* @return the data associated to the suffix entry
*
* Get data associated to suffix entry @a index in tail data.
*/
TrieData
tail_get_data (const Tail *t, TrieIndex index)
{
index -= TAIL_START_BLOCKNO;
return LIKELY (index < t->num_tails)
? t->tails[index].data : TRIE_DATA_ERROR;
}
/**
* @brief Set data associated to suffix entry
*
* @param t : the tail data
* @param index : the index of the suffix
* @param data : the data to set
*
* @return boolean indicating success
*
* Set data associated to suffix entry @a index in tail data.
*/
Bool
tail_set_data (Tail *t, TrieIndex index, TrieData data)
{
index -= TAIL_START_BLOCKNO;
if (LIKELY (index < t->num_tails)) {
t->tails[index].data = data;
return TRUE;
}
return FALSE;
}
/**
* @brief Delete suffix entry
*
* @param t : the tail data
* @param index : the index of the suffix to delete
*
* Delete suffix entry from the tail data.
*/
void
tail_delete (Tail *t, TrieIndex index)
{
tail_free_block (t, index);
}
/**
* @brief Walk in tail with a string
*
* @param t : the tail data
* @param s : the tail data index
* @param suffix_idx : pointer to current character index in suffix
* @param str : the string to use in walking
* @param len : total characters in @a str to walk
*
* @return total number of characters successfully walked
*
* Walk in the tail data @a t at entry @a s, from given character position
* @a *suffix_idx, using @a len characters of given string @a str. On return,
* @a *suffix_idx is updated to the position after the last successful walk,
* and the function returns the total number of character succesfully walked.
*/
int
tail_walk_str (const Tail *t,
TrieIndex s,
short *suffix_idx,
const TrieChar *str,
int len)
{
const TrieChar *suffix;
int i;
short j;
suffix = tail_get_suffix (t, s);
if (UNLIKELY (!suffix))
return FALSE;
i = 0; j = *suffix_idx;
while (i < len) {
if (str[i] != suffix[j])
break;
++i;
/* stop and stay at null-terminator */
2024-04-24 14:24:51 +08:00
if (TRIE_CHAR_TERM == suffix[j])
2022-05-14 00:40:53 +08:00
break;
++j;
}
*suffix_idx = j;
return i;
}
/**
* @brief Walk in tail with a character
*
* @param t : the tail data
* @param s : the tail data index
* @param suffix_idx : pointer to current character index in suffix
* @param c : the character to use in walking
*
* @return boolean indicating success
*
* Walk in the tail data @a t at entry @a s, from given character position
* @a *suffix_idx, using given character @a c. If the walk is successful,
* it returns TRUE, and @a *suffix_idx is updated to the next character.
* Otherwise, it returns FALSE, and @a *suffix_idx is left unchanged.
*/
Bool
tail_walk_char (const Tail *t,
TrieIndex s,
short *suffix_idx,
TrieChar c)
{
const TrieChar *suffix;
TrieChar suffix_char;
suffix = tail_get_suffix (t, s);
if (UNLIKELY (!suffix))
return FALSE;
suffix_char = suffix[*suffix_idx];
if (suffix_char == c) {
2024-04-24 14:24:51 +08:00
if (TRIE_CHAR_TERM != suffix_char)
2022-05-14 00:40:53 +08:00
++*suffix_idx;
return TRUE;
}
return FALSE;
}
/*
vi:ts=4:ai:expandtab
*/