FIELD NOTES: 書を持って街へ出よう

合同会社フィールドワークス プログラマ兼代表のブログ

UTF-8 ⇔ UTF-16 変換関数 その2

昨日作成したUTF-8UTF-16変換関数が,いかにも効率が悪そうな作りだったので,チューニングしてみた。

まあ,巨大な文字列を変換する予定はないので,実用的には大差ないと思うけど。

let explode_to_int s =
  let rec exp i l =
    if i < 0 then l else exp (i - 1) ((int_of_char s.[i]) :: l) in
  exp (String.length s - 1) []

let implode_from_int l =
  let res = String.create (List.length l) in
  let rec imp i = function
  | [] -> res
  | c :: l -> res.[i] <- (char_of_int c); imp (i + 1) l in
  imp 0 l

(* UTF-8 → UTF-16BE(BOM付) 変換 *)
let utf8_to_utf16be s =
  let hi i = (i lsr 8) land 0xff in
  let lo i = i land 0xff in
  let rec loop = function
    | [] -> []
    | ch0 :: rest when (ch0 land 0x80) = 0x00 ->
        let wch = ch0 land 0x7f in
        (hi wch) :: (lo wch) :: loop rest
    | ch0 :: ch1 :: rest when (ch0 land 0xe0) = 0xc0 ->
        let wch = ((ch0 land 0x3f) lsl 6) lor
                   (ch1 land 0x3f) in
        (hi wch) :: (lo wch) :: loop rest
    | ch0 :: ch1 :: ch2 :: rest ->
        let wch = ((ch0 land 0x0f) lsl 12) lor
                  ((ch1 land 0x3f) lsl 6) lor
                   (ch2 land 0x3f) in
        (hi wch) :: (lo wch) :: loop rest
    | _ -> raise (Invalid_argument s)
  in
  implode_from_int (0xfe :: 0xff :: (loop (explode_to_int s)))

(* UTF-16BE(BOM付) → UTF-8 変換 *)
let utf16be_to_utf8 s =
  let rec loop = function
    | [] -> []
    | ch0 :: ch1 :: rest ->
        let wch = (ch0 lsl 8) lor ch1 in
        if wch <= 0x007f then
            wch land 0x007f :: loop rest
        else if wch <= 0x07ff then
          ((wch land 0x07c0) lsr  6) lor 0xc0 ::
          ((wch land 0x003f)       ) lor 0x80 :: loop rest
        else 
          ((wch land 0xf000) lsr 12) lor 0xe0 ::
          ((wch land 0x0fc0) lsr  6) lor 0x80 ::
          ((wch land 0x003f)       ) lor 0x80 :: loop rest
    | _ -> raise (Invalid_argument s)
  in
  let chars =
    match explode_to_int s with
    | 0xfe :: 0xff :: rest -> loop rest
    | s -> s
  in
  implode_from_int chars