UTF-8 ⇔ UTF-16 変換関数 その2
昨日作成したUTF-8⇔UTF-16変換関数が,いかにも効率が悪そうな作りだったので,チューニングしてみた。
まあ,巨大な文字列を変換する予定はないので,実用的には大差ないと思うけど。
let explode_to_int s = let rec exp i l = if i < 0 then l else exp (i - 1) ((int_of_char s.[i]) :: l) in exp (String.length s - 1) [] let implode_from_int l = let res = String.create (List.length l) in let rec imp i = function | [] -> res | c :: l -> res.[i] <- (char_of_int c); imp (i + 1) l in imp 0 l (* UTF-8 → UTF-16BE(BOM付) 変換 *) let utf8_to_utf16be s = let hi i = (i lsr 8) land 0xff in let lo i = i land 0xff in let rec loop = function | [] -> [] | ch0 :: rest when (ch0 land 0x80) = 0x00 -> let wch = ch0 land 0x7f in (hi wch) :: (lo wch) :: loop rest | ch0 :: ch1 :: rest when (ch0 land 0xe0) = 0xc0 -> let wch = ((ch0 land 0x3f) lsl 6) lor (ch1 land 0x3f) in (hi wch) :: (lo wch) :: loop rest | ch0 :: ch1 :: ch2 :: rest -> let wch = ((ch0 land 0x0f) lsl 12) lor ((ch1 land 0x3f) lsl 6) lor (ch2 land 0x3f) in (hi wch) :: (lo wch) :: loop rest | _ -> raise (Invalid_argument s) in implode_from_int (0xfe :: 0xff :: (loop (explode_to_int s))) (* UTF-16BE(BOM付) → UTF-8 変換 *) let utf16be_to_utf8 s = let rec loop = function | [] -> [] | ch0 :: ch1 :: rest -> let wch = (ch0 lsl 8) lor ch1 in if wch <= 0x007f then wch land 0x007f :: loop rest else if wch <= 0x07ff then ((wch land 0x07c0) lsr 6) lor 0xc0 :: ((wch land 0x003f) ) lor 0x80 :: loop rest else ((wch land 0xf000) lsr 12) lor 0xe0 :: ((wch land 0x0fc0) lsr 6) lor 0x80 :: ((wch land 0x003f) ) lor 0x80 :: loop rest | _ -> raise (Invalid_argument s) in let chars = match explode_to_int s with | 0xfe :: 0xff :: rest -> loop rest | s -> s in implode_from_int chars