日本語文字コードの自動判別方法

http://pub.cozmixng.org/~the-rwiki/rw-cgi.rb?cmd=view;name=%A4%BF%A4%E0%A4%E9%3A%3A%C6%FC%CB%DC%B8%EC%CA%B8%BB%FA%A5%B3%A1%BC%A5%C9%A4%CE%BC%AB%C6%B0%C8%BD%C4%EA
によるとGaucheのものが一番よさそうです。

ということで、Gaucheの日本語文字コードの自動判別ルーチンを別のC言語のプログラムで使えるようにひっぺがしてみました。以下、その手順です。

$ tar zxvf Gauche-0.8.5.tgz
$ cd Gauche-0.8.5
$ ./configure
$ make
$ cp ext/charconv/guess.c /path/to/another/dir
$ cp ext/charconv/guess_tab.c /path/to/another/dir
$ cd /path/to/another/dir

guess.cに以下のpatchを当てます。

--- guess.c.orig	2005-08-27 13:21:44.000000000 +0900
+++ guess.c	2005-08-27 13:23:33.000000000 +0900
@@ -33,9 +33,12 @@
  *  $Id: guess.c,v 1.4 2004/10/06 09:25:36 shirok Exp $
  */
 
+#if 0
 #include <gauche.h>
 #include <gauche/extend.h>
 #include "charconv.h"
+#endif
+#include <stdio.h>
 
 typedef struct guess_arc_rec {
     unsigned int next;          /* next state */
@@ -71,7 +74,7 @@
 /* include DFA table generated by guess.scm */
 #include "guess_tab.c"
 
-static const char *guess_jp(const char *buf, int buflen, void *data)
+const char *guess_jp(const char *buf, int buflen, void *data)
 {
     int i;
     guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar);
@@ -141,7 +144,7 @@
     return NULL;
 }
 
-
+#if 0
 /*
  * Initialization
  */
@@ -150,4 +153,4 @@
 {
     Scm_RegisterCodeGuessingProc("*JP", guess_jp, NULL);
 }
-
+#endif

テストプログラム: test_guess.c (ソースファイルはEUC-JPで保存してください。文字列リテラルのバイト列がそのまま判別対象になります)

#include <stdio.h>

const char *guess_jp(const char *, int, void *);

/*
 * Test driver for guess_jp().
 *
 * Passes the bytes of a short Japanese string literal to guess_jp() and
 * prints the encoding name it returns.  The bytes of the literal are
 * whatever encoding this source file is saved in, so the printed result
 * reflects the file's own encoding.
 *
 * Returns 0 when an encoding name was printed, 1 when guess_jp() could
 * not decide (returned NULL).
 */
int
main(int argc, char *argv[])
{
    char buf[] = "ほげ";
    const char *encoding;

    /* Command-line arguments are not used by this test. */
    (void)argc;
    (void)argv;

    /* sizeof(buf) - 1 excludes the terminating NUL from the input. */
    encoding = guess_jp(buf, sizeof(buf) - 1, NULL);

    /* guess_jp() may return NULL; handing NULL to "%s" is undefined. */
    if (encoding == NULL) {
        fprintf(stderr, "guess_jp: could not determine encoding\n");
        return 1;
    }

    printf("%s\n", encoding);

    return 0;
}

テストプログラムのMakefile (bsd.prog.mkを利用するため*BSD専用)

# Name of the executable to build.
PROG=test_guess
# Sources to compile. guess_tab.c is not listed because guess.c pulls it
# in with #include "guess_tab.c".
SRCS=test_guess.c guess.c
# Left empty -- presumably so bsd.prog.mk does not expect a manual page
# for this program (NOTE(review): confirm against your system's bsd.prog.mk).
MAN=

# Pull in the BSD make rules for building a single program.
.include <bsd.prog.mk>

テストプログラムの実行結果

$ ./test_guess
EUC-JP