Optional use of UTF-8 strings in resource bundles

Allows the use of UTF-8 for packing resources instead of the default of UTF-16 for Java. When strings are extracted from the ResStringPool, they are converted to UTF-16 and the result is cached for subsequent calls. When using aapt to package, add in the "-8" switch to pack the resources using UTF-8. This will result in the value, key, and type strings as well as the compiled XML string values taking significantly less space in the final application package in most scenarios. Change-Id: I129483f8b3d3b1c5869dced05cb525e494a6c83a
2009-12-04 09:38:48 -08:00 · 2009-12-04 09:38:48 -08:00 · 9a2d83e698
parent 09b41cbf9f
commit 9a2d83e698
6 changed files with 203 additions and 48 deletions
--- a/include/utils/ResourceTypes.h
+++ b/include/utils/ResourceTypes.h
@ -393,7 +393,10 @@ struct ResStringPool_header
    enum {
        // If set, the string index is sorted by the string values (based
        // on strcmp16()).
-        SORTED_FLAG = 1<<0
+        SORTED_FLAG = 1<<0,
+
+        // String pool is encoded in UTF-8
+        UTF8_FLAG = 1<<8
    };
    uint32_t flags;

@ -456,9 +459,11 @@ private:
    void*                       mOwnedData;
    const ResStringPool_header* mHeader;
    size_t                      mSize;
+    mutable Mutex               mDecodeLock;
    const uint32_t*             mEntries;
    const uint32_t*             mEntryStyles;
-    const char16_t*             mStrings;
+    const void*                 mStrings;
+    char16_t**                  mCache;
    uint32_t                    mStringPoolSize;    // number of uint16_t
    const uint32_t*             mStyles;
    uint32_t                    mStylePoolSize;    // number of uint32_t
--- a/include/utils/String16.h
+++ b/include/utils/String16.h
@ -49,12 +49,17 @@ int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2);
 // Version of strzcmp16 for comparing strings in different endianness.
 int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2);

+// Convert UTF-8 to UTF-16 including surrogate pairs
+void utf8_to_utf16(const uint8_t *src, size_t srcLen, char16_t* dst, const size_t dstLen);
+
 }

 // ---------------------------------------------------------------------------

 namespace android {

+// ---------------------------------------------------------------------------
+
 class String8;
 class TextOutput;

--- a/include/utils/String8.h
+++ b/include/utils/String8.h
@ -57,6 +57,11 @@ size_t utf8_length(const char *src);
 */
 size_t utf32_length(const char *src, size_t src_len);

+/*
+ * Returns the UTF-8 length of "src".
+ */
+size_t utf8_length_from_utf16(const char16_t *src, size_t src_len);
+
 /*
 * Returns the UTF-8 length of "src".
 */
@ -120,6 +125,9 @@ size_t utf8_to_utf32(const char* src, size_t src_len,
 size_t utf32_to_utf8(const char32_t* src, size_t src_len,
                     char* dst, size_t dst_len);

+size_t utf16_to_utf8(const char16_t* src, size_t src_len,
+                     char* dst, size_t dst_len);
+
 }

 // ---------------------------------------------------------------------------
--- a/libs/utils/ResourceTypes.cpp
+++ b/libs/utils/ResourceTypes.cpp
@ -229,12 +229,12 @@ Res_png_9patch* Res_png_9patch::deserialize(const void* inData)
 // --------------------------------------------------------------------

 ResStringPool::ResStringPool()
-    : mError(NO_INIT), mOwnedData(NULL)
+    : mError(NO_INIT), mOwnedData(NULL), mHeader(NULL), mCache(NULL)
 {
 }

 ResStringPool::ResStringPool(const void* data, size_t size, bool copyData)
-    : mError(NO_INIT), mOwnedData(NULL)
+    : mError(NO_INIT), mOwnedData(NULL), mHeader(NULL), mCache(NULL)
 {
    setTo(data, size, copyData);
 }
@ -296,7 +296,17 @@ status_t ResStringPool::setTo(const void* data, size_t size, bool copyData)
                    (int)size);
            return (mError=BAD_TYPE);
        }
-        mStrings = (const char16_t*)
+
+        size_t charSize;
+        if (mHeader->flags&ResStringPool_header::UTF8_FLAG) {
+            charSize = sizeof(uint8_t);
+            mCache = (char16_t**)malloc(sizeof(char16_t**)*mHeader->stringCount);
+            memset(mCache, 0, sizeof(char16_t**)*mHeader->stringCount);
+        } else {
+            charSize = sizeof(char16_t);
+        }
+
+        mStrings = (const void*)
            (((const uint8_t*)data)+mHeader->stringsStart);
        if (mHeader->stringsStart >= (mHeader->header.size-sizeof(uint16_t))) {
            LOGW("Bad string block: string pool starts at %d, after total size %d\n",
@ -305,7 +315,7 @@ status_t ResStringPool::setTo(const void* data, size_t size, bool copyData)
        }
        if (mHeader->styleCount == 0) {
            mStringPoolSize =
-                (mHeader->header.size-mHeader->stringsStart)/sizeof(uint16_t);
+                (mHeader->header.size-mHeader->stringsStart)/charSize;
        } else {
            // check invariant: styles follow the strings
            if (mHeader->stylesStart <= mHeader->stringsStart) {
@ -314,7 +324,7 @@ status_t ResStringPool::setTo(const void* data, size_t size, bool copyData)
                return (mError=BAD_TYPE);
            }
            mStringPoolSize =
-                (mHeader->stylesStart-mHeader->stringsStart)/sizeof(uint16_t);
+                (mHeader->stylesStart-mHeader->stringsStart)/charSize;
        }

        // check invariant: stringCount > 0 requires a string pool to exist
@ -329,13 +339,19 @@ status_t ResStringPool::setTo(const void* data, size_t size, bool copyData)
            for (i=0; i<mHeader->stringCount; i++) {
                e[i] = dtohl(mEntries[i]);
            }
-            char16_t* s = const_cast<char16_t*>(mStrings);
-            for (i=0; i<mStringPoolSize; i++) {
-                s[i] = dtohs(mStrings[i]);
+            if (!(mHeader->flags&ResStringPool_header::UTF8_FLAG)) {
+                const char16_t* strings = (const char16_t*)mStrings;
+                char16_t* s = const_cast<char16_t*>(strings);
+                for (i=0; i<mStringPoolSize; i++) {
+                    s[i] = dtohs(strings[i]);
+                }
            }
        }

-        if (mStrings[mStringPoolSize-1] != 0) {
+        if ((mHeader->flags&ResStringPool_header::UTF8_FLAG &&
+                ((uint8_t*)mStrings)[mStringPoolSize-1] != 0) ||
+                (!mHeader->flags&ResStringPool_header::UTF8_FLAG &&
+                ((char16_t*)mStrings)[mStringPoolSize-1] != 0)) {
            LOGW("Bad string block: last string is not 0-terminated\n");
            return (mError=BAD_TYPE);
        }
@ -410,24 +426,67 @@ void ResStringPool::uninit()
        free(mOwnedData);
        mOwnedData = NULL;
    }
+    if (mHeader != NULL && mCache != NULL) {
+        for (size_t x = 0; x < mHeader->stringCount; x++) {
+            if (mCache[x] != NULL) {
+                free(mCache[x]);
+                mCache[x] = NULL;
+            }
+        }
+        free(mCache);
+        mCache = NULL;
+    }
 }

+#define DECODE_LENGTH(str, chrsz, len) \
+    len = *(str); \
+    if (*(str)&(1<<(chrsz*8-1))) { \
+        (str)++; \
+        len = (((len)&((1<<(chrsz*8-1))-1))<<(chrsz*8)) + *(str); \
+    } \
+    (str)++;
+
 const uint16_t* ResStringPool::stringAt(size_t idx, size_t* outLen) const
 {
    if (mError == NO_ERROR && idx < mHeader->stringCount) {
-        const uint32_t off = (mEntries[idx]/sizeof(uint16_t));
+        const bool isUTF8 = (mHeader->flags&ResStringPool_header::UTF8_FLAG) != 0;
+        const uint32_t off = mEntries[idx]/(isUTF8?sizeof(char):sizeof(char16_t));
        if (off < (mStringPoolSize-1)) {
-            const char16_t* str = mStrings+off;
-            *outLen = *str;
-            if ((*str)&0x8000) {
-                str++;
-                *outLen = (((*outLen)&0x7fff)<<16) + *str;
-            }
-            if ((uint32_t)(str+1+*outLen-mStrings) < mStringPoolSize) {
-                return str+1;
+            if (!isUTF8) {
+                const char16_t* strings = (char16_t*)mStrings;
+                const char16_t* str = strings+off;
+                DECODE_LENGTH(str, sizeof(char16_t), *outLen)
+                if ((uint32_t)(str+*outLen-strings) < mStringPoolSize) {
+                    return str;
+                } else {
+                    LOGW("Bad string block: string #%d extends to %d, past end at %d\n",
+                            (int)idx, (int)(str+*outLen-strings), (int)mStringPoolSize);
+                }
            } else {
-                LOGW("Bad string block: string #%d extends to %d, past end at %d\n",
-                        (int)idx, (int)(str+1+*outLen-mStrings), (int)mStringPoolSize);
+                const uint8_t* strings = (uint8_t*)mStrings;
+                const uint8_t* str = strings+off;
+                DECODE_LENGTH(str, sizeof(uint8_t), *outLen)
+                size_t encLen;
+                DECODE_LENGTH(str, sizeof(uint8_t), encLen)
+                if ((uint32_t)(str+encLen-strings) < mStringPoolSize) {
+                    AutoMutex lock(mDecodeLock);
+                    if (mCache[idx] != NULL) {
+                        return mCache[idx];
+                    }
+                    char16_t *u16str = (char16_t *)calloc(*outLen+1, sizeof(char16_t));
+                    if (!u16str) {
+                        LOGW("No memory when trying to allocate decode cache for string #%d\n",
+                                (int)idx);
+                        return NULL;
+                    }
+                    const unsigned char *u8src = reinterpret_cast<const unsigned char *>(str);
+                    utf8_to_utf16(u8src, encLen, u16str, *outLen);
+                    mCache[idx] = u16str;
+                    return u16str;
+                } else {
+                    LOGW("Bad string block: string #%d extends to %d, past end at %d\n",
+                            (int)idx, (int)(str+encLen-strings), (int)mStringPoolSize);
+                }
            }
        } else {
            LOGW("Bad string block: string #%d entry is at %d, past end at %d\n",
@ -466,6 +525,10 @@ ssize_t ResStringPool::indexOfString(const char16_t* str, size_t strLen) const

    size_t len;

+    // TODO optimize searching for UTF-8 strings taking into account
+    // the cache fill to determine when to convert the searched-for
+    // string key to UTF-8.
+
    if (mHeader->flags&ResStringPool_header::SORTED_FLAG) {
        // Do a binary search for the string...
        ssize_t l = 0;
@ -1043,6 +1106,7 @@ status_t ResXMLTree::getError() const
 void ResXMLTree::uninit()
 {
    mError = NO_INIT;
+    mStrings.uninit();
    if (mOwnedData) {
        free(mOwnedData);
        mOwnedData = NULL;
--- a/libs/utils/String16.cpp
+++ b/libs/utils/String16.cpp
@ -172,10 +172,6 @@ int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2
           : 0);
 }

-// ---------------------------------------------------------------------------
-
-namespace android {
-
 static inline size_t
 utf8_char_len(uint8_t ch)
 {
@ -215,8 +211,38 @@ utf8_to_utf32(const uint8_t *src, size_t length)
    //printf("Char at %p: len=%d, utf-16=%p\n", src, length, (void*)result);
 }

+void
+utf8_to_utf16(const uint8_t *src, size_t srcLen,
+        char16_t* dst, const size_t dstLen)
+{
+    const uint8_t* const end = src + srcLen;
+    const char16_t* const dstEnd = dst + dstLen;
+    while (src < end && dst < dstEnd) {
+        size_t len = utf8_char_len(*src);
+        uint32_t codepoint = utf8_to_utf32((const uint8_t*)src, len);
+
+        // Convert the UTF32 codepoint to one or more UTF16 codepoints
+        if (codepoint <= 0xFFFF) {
+            // Single UTF16 character
+            *dst++ = (char16_t) codepoint;
+        } else {
+            // Multiple UTF16 characters with surrogates
+            codepoint = codepoint - 0x10000;
+            *dst++ = (char16_t) ((codepoint >> 10) + 0xD800);
+            *dst++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
+        }
+
+        src += len;
+    }
+    if (dst < dstEnd) {
+        *dst = 0;
+    }
+}
+
 // ---------------------------------------------------------------------------

+namespace android {
+
 static SharedBuffer* gEmptyStringBuf = NULL;
 static char16_t* gEmptyString = NULL;

@ -260,30 +286,14 @@ static char16_t* allocFromUTF8(const char* in, size_t len)
        p += utf8len;
    }
    
-    SharedBuffer* buf = SharedBuffer::alloc((chars+1)*sizeof(char16_t));
+    size_t bufSize = (chars+1)*sizeof(char16_t);
+    SharedBuffer* buf = SharedBuffer::alloc(bufSize);
    if (buf) {
        p = in;
        char16_t* str = (char16_t*)buf->data();
-        char16_t* d = str;
-        while (p < end) {
-            size_t len = utf8_char_len(*p);
-            uint32_t codepoint = utf8_to_utf32((const uint8_t*)p, len);
-
-            // Convert the UTF32 codepoint to one or more UTF16 codepoints
-            if (codepoint <= 0xFFFF) {
-                // Single UTF16 character
-                *d++ = (char16_t) codepoint;
-            } else {
-                // Multiple UTF16 characters with surrogates
-                codepoint = codepoint - 0x10000;
-                *d++ = (char16_t) ((codepoint >> 10) + 0xD800);
-                *d++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
-            }
-
-            p += len;
-        }
-        *d = 0;
        
+        utf8_to_utf16((const uint8_t*)p, len, str, bufSize);
+
        //printf("Created UTF-16 string from UTF-8 \"%s\":", in);
        //printHexData(1, str, buf->size(), 16, 1);
        //printf("\n");
--- a/libs/utils/String8.cpp
+++ b/libs/utils/String8.cpp
@ -208,10 +208,23 @@ static char* allocFromUTF16OrUTF32(const T* in, L len)
    return getEmptyString();
 }

-// Note: not dealing with expanding surrogate pairs.
 static char* allocFromUTF16(const char16_t* in, size_t len)
 {
-    return allocFromUTF16OrUTF32<char16_t, size_t>(in, len);
+    if (len == 0) return getEmptyString();
+
+    const size_t bytes = utf8_length_from_utf16(in, len);
+
+    SharedBuffer* buf = SharedBuffer::alloc(bytes+1);
+    LOG_ASSERT(buf, "Unable to allocate shared buffer");
+    if (buf) {
+        char* str = (char*)buf->data();
+
+        utf16_to_utf8(in, len, str, bytes+1);
+
+        return str;
+    }
+
+    return getEmptyString();
 }

 static char* allocFromUTF32(const char32_t* in, size_t len)
@ -762,6 +775,26 @@ size_t utf8_length_from_utf32(const char32_t *src, size_t src_len)
    return ret;
 }

+size_t utf8_length_from_utf16(const char16_t *src, size_t src_len)
+{
+    if (src == NULL || src_len == 0) {
+        return 0;
+    }
+    size_t ret = 0;
+    const char16_t* const end = src + src_len;
+    while (src < end) {
+        if ((*src & 0xFC00) == 0xD800 && (src + 1) < end
+                && (*++src & 0xFC00) == 0xDC00) {
+            // surrogate pairs are always 4 bytes.
+            ret += 4;
+            src++;
+        } else {
+            ret += android::utf32_to_utf8_bytes((char32_t) *src++);
+        }
+    }
+    return ret;
+}
+
 static int32_t utf32_at_internal(const char* cur, size_t *num_read)
 {
    const char first_char = *cur;
@ -848,3 +881,33 @@ size_t utf32_to_utf8(const char32_t* src, size_t src_len,
    }
    return cur - dst;
 }
+
+size_t utf16_to_utf8(const char16_t* src, size_t src_len,
+                     char* dst, size_t dst_len)
+{
+    if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) {
+        return 0;
+    }
+    const char16_t* cur_utf16 = src;
+    const char16_t* const end_utf16 = src + src_len;
+    char *cur = dst;
+    const char* const end = dst + dst_len;
+    while (cur_utf16 < end_utf16 && cur < end) {
+        char32_t utf32;
+        // surrogate pairs
+        if ((*cur_utf16 & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16) {
+            utf32 = (*cur_utf16++ - 0xD800) << 10;
+            utf32 |= *cur_utf16++ - 0xDC00;
+            utf32 += 0x10000;
+        } else {
+            utf32 = (char32_t) *cur_utf16++;
+        }
+        size_t len = android::utf32_to_utf8_bytes(utf32);
+        android::utf32_to_utf8((uint8_t*)cur, utf32, len);
+        cur += len;
+    }
+    if (cur < end) {
+        *cur = '\0';
+    }
+    return cur - dst;
+}