/*
 * chardet.c
 * Copyright 2006-2010 Yoshiki Yazawa, Matti Hämäläinen, and John Lindgren
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions, and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions, and the following disclaimer in the documentation
 *    provided with the distribution.
 *
 * This software is provided "as is" and without any warranty, express or
 * implied. In no event shall the authors be liable for any damages arising from
 * the use of this software.
 */

#include <glib.h>
#include <string.h>
#include <libaudcore/audstrings.h>

#include "debug.h"
#include "i18n.h"
#include "main.h"
#include "misc.h"

#ifdef USE_CHARDET
#  include <libguess.h>
#endif

/* Forward declaration: cd_str_to_utf8() calls this function before its
 * definition appears further down in this file. */
static char * cd_chardet_to_utf8 (const char * str, int len,
 int * arg_bytes_read, int * arg_bytes_written);

/* Last-resort conversion used when nothing else could decode STR.
 *
 * Builds a new string with g_strconcat() consisting of STR followed by the
 * translated "  (invalid UTF-8)" marker, then overwrites every byte that has
 * its high bit set with '?'.  The result therefore contains only 7-bit bytes.
 *
 * NOTE(review): the masking pass walks the whole concatenated buffer, so any
 * non-ASCII bytes in the translated marker are replaced as well.
 *
 * Returns the string allocated by g_strconcat(); release it with g_free().
 */
static char * str_to_utf8_fallback (const char * str)
{
    char * masked = g_strconcat (str, _("  (invalid UTF-8)"), NULL);
    char * p = masked;

    while (* p)
    {
        /* High bit set: not plain ASCII, so blank it out. */
        if (* p & 0x80)
            * p = '?';

        p ++;
    }

    return masked;
}

/* Converts the NUL-terminated string STR to UTF-8.
 *
 * Returns NULL when STR is NULL; otherwise always returns a newly allocated
 * string.  Three stages are tried in order:
 *   1. If STR already validates as UTF-8, a plain copy is returned.
 *   2. Otherwise cd_chardet_to_utf8() is asked to detect/convert it.
 *   3. If that fails too, str_to_utf8_fallback() produces a '?'-masked copy.
 *
 * Background carried over from the original notes in this function: callers
 * (the playlist is named there) may pass strings that have already been
 * converted, and running charset detection on valid UTF-8 could re-encode it
 * when fallback encodings are configured.  The up-front validation is the
 * guard against that.  When USE_CHARDET is defined the libguess validator is
 * used instead of g_utf8_validate(); the notes describe it as coping with
 * "encapsulated UTF-8" cases that g_utf8_validate() handles poorly.
 */
static char * cd_str_to_utf8 (const char * str)
{
    if (str == NULL)
        return NULL;

    const size_t length = strlen (str);

    /* Stage 1: already UTF-8?  Both build variants share the same return. */
#ifdef USE_CHARDET
    if (libguess_validate_utf8 (str, length))
#else
    if (g_utf8_validate (str, length, NULL))
#endif
        return g_strdup (str);

    /* Stage 2: charset detection and conversion. */
    char * converted = cd_chardet_to_utf8 (str, length, NULL, NULL);

    /* Stage 3: if all else fails, mask off bytes >= 128 with '?'. */
    return (converted != NULL) ? converted : str_to_utf8_fallback (str);
}

/* Converts LEN bytes of STR from ENCODING to UTF-8 with g_convert().
 *
 * The byte counts reported by GLib are stored in *BYTES_READ and
 * *BYTES_WRITE (both must be non-NULL).  Returns the newly allocated result,
 * or NULL if g_convert() failed.
 */
static char * cd_convert_from (const char * str, int len,
 const char * encoding, int * bytes_read, int * bytes_write)
{
    gsize read_gsize = 0, written_gsize = 0;
    char * out = g_convert (str, len, "UTF-8", encoding, & read_gsize,
     & written_gsize, NULL);

    * bytes_read = read_gsize;
    * bytes_write = written_gsize;

    return out;
}

/* Converts STR (LEN bytes, or NUL-terminated if LEN is negative) to UTF-8.
 *
 * Order of attempts:
 *   1. If the input already validates as UTF-8, return a copy of it.
 *   2. With USE_CHARDET: ask libguess for the encoding, using the detector
 *      named by the "chardet_detector" setting, and convert from it.
 *   3. Each encoding listed in the "chardet_fallback" setting (separated by
 *      any of " ,:;|/"); the first one that converts the whole input wins.
 *   4. The current locale's encoding (g_locale_to_utf8).
 *   5. ISO-8859-1.
 * The converted string is validated once more before being returned.
 *
 * ARG_BYTES_READ / ARG_BYTES_WRITE are optional (may be NULL); when given
 * they receive the byte counts of the last conversion attempted.
 *
 * Returns a newly allocated string to be released with g_free(), or NULL if
 * STR is NULL or no conversion produced valid UTF-8.
 */
static char * cd_chardet_to_utf8 (const char * str, int len,
 int * arg_bytes_read, int * arg_bytes_write)
{
    char * ret = NULL;
    int my_bytes_read, my_bytes_write;

    /* Route the counters to local dummies when the caller does not want them,
     * so the code below can store through the pointers unconditionally. */
    int * bytes_read = arg_bytes_read != NULL ? arg_bytes_read : & my_bytes_read;
    int * bytes_write = arg_bytes_write != NULL ? arg_bytes_write : & my_bytes_write;

    g_return_val_if_fail (str != NULL, NULL);

    /* Resolve "NUL-terminated" to a real length once, up front.  The fallback
     * loop compares the number of bytes consumed against LEN, which can never
     * match while LEN is still negative, and the libguess calls are handed
     * LEN directly. */
    if (len < 0)
        len = strlen (str);

#ifdef USE_CHARDET
    if (libguess_validate_utf8 (str, len))
#else
    if (g_utf8_validate (str, len, NULL))
#endif
    {
        /* Already UTF-8: return a NUL-terminated copy. */
        ret = g_malloc (len + 1);
        memcpy (ret, str, len);
        ret[len] = 0;

        * bytes_read = len;
        * bytes_write = len;

        return ret;
    }

#ifdef USE_CHARDET
    char * det = get_string (NULL, "chardet_detector");

    /* get_string()'s contract is not visible from this file, so do not assume
     * a non-NULL result. */
    if (det != NULL && det[0])
    {
        AUDDBG("guess encoding (%s) %s\n", det, str);
        const char * encoding = libguess_determine_encoding (str, len, det);
        /* Passing NULL for a %s conversion is undefined behaviour. */
        AUDDBG("encoding = %s\n", encoding != NULL ? encoding : "(null)");

        if (encoding != NULL)
            ret = cd_convert_from (str, len, encoding, bytes_read, bytes_write);
    }

    g_free (det);
#endif

    /* If detection failed or was not enabled, try the configured fallbacks
     * (if there are any). */
    if (ret == NULL)
    {
        char * fallbacks = get_string (NULL, "chardet_fallback");
        char * * split = fallbacks != NULL ?
         g_strsplit_set (fallbacks, " ,:;|/", -1) : NULL;

        for (char * * enc = split; enc != NULL && * enc != NULL; enc ++)
        {
            /* Adjacent delimiters (e.g. ", ") produce empty tokens; an empty
             * name is not a configured encoding, so skip it. */
            if (! (* enc)[0])
                continue;

            ret = cd_convert_from (str, len, * enc, bytes_read, bytes_write);

            /* Accept only a successful conversion of the entire input. */
            if (ret != NULL && * bytes_read == len)
                break;

            g_free (ret);
            ret = NULL;
        }

        g_strfreev (split);
        g_free (fallbacks);
    }

    /* Next fallback: the current locale's encoding. */
    if (ret == NULL)
    {
        gsize read_gsize = 0, written_gsize = 0;
        ret = g_locale_to_utf8 (str, len, & read_gsize, & written_gsize, NULL);
        * bytes_read = read_gsize;
        * bytes_write = written_gsize;
    }

    /* The final fallback is ISO-8859-1, if no other is specified or all
     * other conversions fail. */
    if (ret == NULL)
        ret = cd_convert_from (str, len, "ISO-8859-1", bytes_read, bytes_write);

    /* Never hand back something that is not valid UTF-8. */
    if (ret != NULL && ! g_utf8_validate (ret, -1, NULL))
    {
        g_warning("g_utf8_validate() failed for converted string in cd_chardet_to_utf8: '%s'", ret);
        g_free (ret);
        ret = NULL;
    }

    return ret; /* NULL if we have no idea. */
}

/* Installs this file's converters as the UTF-8 conversion implementation.
 *
 * Passes cd_str_to_utf8() and cd_chardet_to_utf8() to str_set_utf8_impl(),
 * which is declared outside this file.
 */
void chardet_init (void)
{
#ifdef USE_CHARDET
    /* NOTE(review): a call with no input and an empty detector name;
     * presumably made only to initialise libguess before first real use.
     * The reason is not visible from this file; confirm against libguess. */
    libguess_determine_encoding (NULL, -1, "");
#endif

    str_set_utf8_impl (cd_str_to_utf8, cd_chardet_to_utf8);
}
