FIELD NOTES: 書を持って街へ出よう

合同会社フィールドワークス プログラマ兼代表のブログ

UTF-8 ⇔ UTF-16 変換関数 (OCaml)

UTF-8 と UTF-16 の文字列を相互変換する関数を作成した。
リスト処理を贅沢に使っているので,遅いかもしれない。

open ExtString

(** [utf8_to_utf16be s] converts the UTF-8 encoded string [s] to UTF-16BE.

    The result always starts with a byte order mark (bytes FE FF), so the
    empty string converts to exactly those two bytes. Code points above
    U+FFFF (4-byte UTF-8 sequences) are emitted as a surrogate pair.

    The input is scanned by index and the output is built in a [Buffer], so
    stack usage does not grow with the length of [s].

    Overlong 2- and 3-byte forms and 3-byte sequences that decode into the
    surrogate range are passed through as a single code unit rather than
    rejected.

    @raise Invalid_argument carrying [s] itself when [s] contains a
    truncated sequence, an invalid lead byte (80..BF or F8..FF), a missing
    or malformed continuation byte, or a 4-byte sequence that decodes
    outside U+10000..U+10FFFF. *)
let utf8_to_utf16be s =
  let len = String.length s in
  let out = Buffer.create ((2 * len) + 2) in
  let invalid () = raise (Invalid_argument s) in
  (* Append one 16-bit code unit, most significant byte first. *)
  let add_unit u =
    Buffer.add_char out (Char.chr ((u lsr 8) land 0xff));
    Buffer.add_char out (Char.chr (u land 0xff))
  in
  (* Payload (low six bits) of the continuation byte at index [i]; fails
     when the input ends early or the byte is not of the form 10xxxxxx. *)
  let cont i =
    if i >= len then invalid ()
    else
      let b = Char.code s.[i] in
      if b land 0xc0 = 0x80 then b land 0x3f else invalid ()
  in
  let rec loop i =
    if i < len then begin
      let b0 = Char.code s.[i] in
      if b0 land 0x80 = 0x00 then begin
        (* 0xxxxxxx: one byte, U+0000..U+007F *)
        add_unit b0;
        loop (i + 1)
      end
      else if b0 land 0xe0 = 0xc0 then begin
        (* 110xxxxx 10xxxxxx *)
        let c1 = cont (i + 1) in
        add_unit (((b0 land 0x1f) lsl 6) lor c1);
        loop (i + 2)
      end
      else if b0 land 0xf0 = 0xe0 then begin
        (* 1110xxxx 10xxxxxx 10xxxxxx *)
        let c1 = cont (i + 1) in
        let c2 = cont (i + 2) in
        add_unit (((b0 land 0x0f) lsl 12) lor (c1 lsl 6) lor c2);
        loop (i + 3)
      end
      else if b0 land 0xf8 = 0xf0 then begin
        (* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx *)
        let c1 = cont (i + 1) in
        let c2 = cont (i + 2) in
        let c3 = cont (i + 3) in
        let cp =
          ((b0 land 0x07) lsl 18) lor (c1 lsl 12) lor (c2 lsl 6) lor c3
        in
        (* Only U+10000..U+10FFFF can be written as a surrogate pair. *)
        if cp < 0x10000 || cp > 0x10ffff then invalid ();
        let v = cp - 0x10000 in
        add_unit (0xd800 lor (v lsr 10));
        add_unit (0xdc00 lor (v land 0x3ff));
        loop (i + 4)
      end
      else
        (* Stray continuation byte, or a lead byte in F8..FF. *)
        invalid ()
    end
  in
  add_unit 0xfeff;
  loop 0;
  Buffer.contents out

(** [utf16be_to_utf8 s] converts a UTF-16BE string that starts with a byte
    order mark (bytes FE FF) to UTF-8. The byte order mark itself is not
    copied to the output.

    When [s] does not start with FE FF it is returned unchanged.

    A high surrogate (D800..DBFF) immediately followed by a low surrogate
    (DC00..DFFF) is combined into one code point and written as a 4-byte
    UTF-8 sequence. An unpaired surrogate is written as a 3-byte sequence
    on its own.

    The input is scanned by index and the output is built in a [Buffer], so
    stack usage does not grow with the length of [s].

    @raise Invalid_argument carrying [s] itself when [s] starts with the
    byte order mark but has an odd number of bytes. *)
let utf16be_to_utf8 s =
  let len = String.length s in
  if len < 2 || s.[0] <> '\xfe' || s.[1] <> '\xff' then s
  else begin
    (* Every code unit is two bytes, so a trailing single byte is an error. *)
    if len land 1 = 1 then raise (Invalid_argument s);
    let out = Buffer.create len in
    (* Big-endian 16-bit code unit stored at byte offset [i]. *)
    let unit_at i = (Char.code s.[i] lsl 8) lor Char.code s.[i + 1] in
    let add b = Buffer.add_char out (Char.chr b) in
    (* Write one code point as a 1- to 4-byte UTF-8 sequence. *)
    let add_code_point cp =
      if cp <= 0x7f then add cp
      else if cp <= 0x7ff then begin
        add (0xc0 lor (cp lsr 6));
        add (0x80 lor (cp land 0x3f))
      end
      else if cp <= 0xffff then begin
        add (0xe0 lor (cp lsr 12));
        add (0x80 lor ((cp lsr 6) land 0x3f));
        add (0x80 lor (cp land 0x3f))
      end
      else begin
        add (0xf0 lor (cp lsr 18));
        add (0x80 lor ((cp lsr 12) land 0x3f));
        add (0x80 lor ((cp lsr 6) land 0x3f));
        add (0x80 lor (cp land 0x3f))
      end
    in
    let rec loop i =
      if i < len then begin
        let u = unit_at i in
        if
          u land 0xfc00 = 0xd800
          && i + 3 < len
          && unit_at (i + 2) land 0xfc00 = 0xdc00
        then begin
          (* Surrogate pair: ten payload bits from each half. *)
          let lo = unit_at (i + 2) in
          add_code_point
            (0x10000 + (((u land 0x3ff) lsl 10) lor (lo land 0x3ff)));
          loop (i + 4)
        end
        else begin
          add_code_point u;
          loop (i + 2)
        end
      end
    in
    (* Start after the two-byte byte order mark. *)
    loop 2;
    Buffer.contents out
  end