summaryrefslogtreecommitdiff
path: root/src/syntax/sedlex_cset.ml
diff options
context:
space:
mode:
Diffstat (limited to 'src/syntax/sedlex_cset.ml')
-rw-r--r--src/syntax/sedlex_cset.ml595
1 files changed, 1 insertions, 594 deletions
diff --git a/src/syntax/sedlex_cset.ml b/src/syntax/sedlex_cset.ml
index 6bee853..eff272c 100644
--- a/src/syntax/sedlex_cset.ml
+++ b/src/syntax/sedlex_cset.ml
@@ -2,597 +2,4 @@
(* See the attached LICENSE file. *)
(* Copyright 2005, 2013 by Alain Frisch and LexiFi. *)
-(* Character sets are represented as lists of intervals. The
- intervals must be non-overlapping and not collapsable, and the list
- must be ordered in increasing order. *)
-
-type t = (int * int) list
-
-let max_code = 0x10ffff (* must be < max_int *)
-
-let min_code = -1
-let empty = []
-let singleton i = [(i, i)]
-let is_empty = function [] -> true | _ -> false
-let interval i j = if i <= j then [(i, j)] else [(j, i)]
-let eof = singleton (-1)
-let any = interval 0 max_code
-
-let rec union c1 c2 =
- match (c1, c2) with
- | [], _ -> c2
- | _, [] -> c1
- | ((i1, j1) as s1) :: r1, (i2, j2) :: r2 ->
- if i1 <= i2 then
- if j1 + 1 < i2 then s1 :: union r1 c2
- else if j1 < j2 then union r1 ((i1, j2) :: r2)
- else union c1 r2
- else union c2 c1
-
-let complement c =
- let rec aux start = function
- | [] -> if start <= max_code then [(start, max_code)] else []
- | (i, j) :: l -> (start, i - 1) :: aux (succ j) l
- in
- match c with (-1, j) :: l -> aux (succ j) l | l -> aux (-1) l
-
-let intersection c1 c2 = complement (union (complement c1) (complement c2))
-let difference c1 c2 = complement (union (complement c1) c2)
-
-(* Unicode classes from XML *)
-
-let base_char =
- [
- (0x0041, 0x005A);
- (0x0061, 0x007A);
- (0x00C0, 0x00D6);
- (0x00D8, 0x00F6);
- (0x00F8, 0x00FF);
- (0x0100, 0x0131);
- (0x0134, 0x013E);
- (0x0141, 0x0148);
- (0x014A, 0x017E);
- (0x0180, 0x01C3);
- (0x01CD, 0x01F0);
- (0x01F4, 0x01F5);
- (0x01FA, 0x0217);
- (0x0250, 0x02A8);
- (0x02BB, 0x02C1);
- (0x0386, 0x0386);
- (0x0388, 0x038A);
- (0x038C, 0x038C);
- (0x038E, 0x03A1);
- (0x03A3, 0x03CE);
- (0x03D0, 0x03D6);
- (0x03DA, 0x03DA);
- (0x03DC, 0x03DC);
- (0x03DE, 0x03DE);
- (0x03E0, 0x03E0);
- (0x03E2, 0x03F3);
- (0x0401, 0x040C);
- (0x040E, 0x044F);
- (0x0451, 0x045C);
- (0x045E, 0x0481);
- (0x0490, 0x04C4);
- (0x04C7, 0x04C8);
- (0x04CB, 0x04CC);
- (0x04D0, 0x04EB);
- (0x04EE, 0x04F5);
- (0x04F8, 0x04F9);
- (0x0531, 0x0556);
- (0x0559, 0x0559);
- (0x0561, 0x0586);
- (0x05D0, 0x05EA);
- (0x05F0, 0x05F2);
- (0x0621, 0x063A);
- (0x0641, 0x064A);
- (0x0671, 0x06B7);
- (0x06BA, 0x06BE);
- (0x06C0, 0x06CE);
- (0x06D0, 0x06D3);
- (0x06D5, 0x06D5);
- (0x06E5, 0x06E6);
- (0x0905, 0x0939);
- (0x093D, 0x093D);
- (0x0958, 0x0961);
- (0x0985, 0x098C);
- (0x098F, 0x0990);
- (0x0993, 0x09A8);
- (0x09AA, 0x09B0);
- (0x09B2, 0x09B2);
- (0x09B6, 0x09B9);
- (0x09DC, 0x09DD);
- (0x09DF, 0x09E1);
- (0x09F0, 0x09F1);
- (0x0A05, 0x0A0A);
- (0x0A0F, 0x0A10);
- (0x0A13, 0x0A28);
- (0x0A2A, 0x0A30);
- (0x0A32, 0x0A33);
- (0x0A35, 0x0A36);
- (0x0A38, 0x0A39);
- (0x0A59, 0x0A5C);
- (0x0A5E, 0x0A5E);
- (0x0A72, 0x0A74);
- (0x0A85, 0x0A8B);
- (0x0A8D, 0x0A8D);
- (0x0A8F, 0x0A91);
- (0x0A93, 0x0AA8);
- (0x0AAA, 0x0AB0);
- (0x0AB2, 0x0AB3);
- (0x0AB5, 0x0AB9);
- (0x0ABD, 0x0ABD);
- (0x0AE0, 0x0AE0);
- (0x0B05, 0x0B0C);
- (0x0B0F, 0x0B10);
- (0x0B13, 0x0B28);
- (0x0B2A, 0x0B30);
- (0x0B32, 0x0B33);
- (0x0B36, 0x0B39);
- (0x0B3D, 0x0B3D);
- (0x0B5C, 0x0B5D);
- (0x0B5F, 0x0B61);
- (0x0B85, 0x0B8A);
- (0x0B8E, 0x0B90);
- (0x0B92, 0x0B95);
- (0x0B99, 0x0B9A);
- (0x0B9C, 0x0B9C);
- (0x0B9E, 0x0B9F);
- (0x0BA3, 0x0BA4);
- (0x0BA8, 0x0BAA);
- (0x0BAE, 0x0BB5);
- (0x0BB7, 0x0BB9);
- (0x0C05, 0x0C0C);
- (0x0C0E, 0x0C10);
- (0x0C12, 0x0C28);
- (0x0C2A, 0x0C33);
- (0x0C35, 0x0C39);
- (0x0C60, 0x0C61);
- (0x0C85, 0x0C8C);
- (0x0C8E, 0x0C90);
- (0x0C92, 0x0CA8);
- (0x0CAA, 0x0CB3);
- (0x0CB5, 0x0CB9);
- (0x0CDE, 0x0CDE);
- (0x0CE0, 0x0CE1);
- (0x0D05, 0x0D0C);
- (0x0D0E, 0x0D10);
- (0x0D12, 0x0D28);
- (0x0D2A, 0x0D39);
- (0x0D60, 0x0D61);
- (0x0E01, 0x0E2E);
- (0x0E30, 0x0E30);
- (0x0E32, 0x0E33);
- (0x0E40, 0x0E45);
- (0x0E81, 0x0E82);
- (0x0E84, 0x0E84);
- (0x0E87, 0x0E88);
- (0x0E8A, 0x0E8A);
- (0x0E8D, 0x0E8D);
- (0x0E94, 0x0E97);
- (0x0E99, 0x0E9F);
- (0x0EA1, 0x0EA3);
- (0x0EA5, 0x0EA5);
- (0x0EA7, 0x0EA7);
- (0x0EAA, 0x0EAB);
- (0x0EAD, 0x0EAE);
- (0x0EB0, 0x0EB0);
- (0x0EB2, 0x0EB3);
- (0x0EBD, 0x0EBD);
- (0x0EC0, 0x0EC4);
- (0x0F40, 0x0F47);
- (0x0F49, 0x0F69);
- (0x10A0, 0x10C5);
- (0x10D0, 0x10F6);
- (0x1100, 0x1100);
- (0x1102, 0x1103);
- (0x1105, 0x1107);
- (0x1109, 0x1109);
- (0x110B, 0x110C);
- (0x110E, 0x1112);
- (0x113C, 0x113C);
- (0x113E, 0x113E);
- (0x1140, 0x1140);
- (0x114C, 0x114C);
- (0x114E, 0x114E);
- (0x1150, 0x1150);
- (0x1154, 0x1155);
- (0x1159, 0x1159);
- (0x115F, 0x1161);
- (0x1163, 0x1163);
- (0x1165, 0x1165);
- (0x1167, 0x1167);
- (0x1169, 0x1169);
- (0x116D, 0x116E);
- (0x1172, 0x1173);
- (0x1175, 0x1175);
- (0x119E, 0x119E);
- (0x11A8, 0x11A8);
- (0x11AB, 0x11AB);
- (0x11AE, 0x11AF);
- (0x11B7, 0x11B8);
- (0x11BA, 0x11BA);
- (0x11BC, 0x11C2);
- (0x11EB, 0x11EB);
- (0x11F0, 0x11F0);
- (0x11F9, 0x11F9);
- (0x1E00, 0x1E9B);
- (0x1EA0, 0x1EF9);
- (0x1F00, 0x1F15);
- (0x1F18, 0x1F1D);
- (0x1F20, 0x1F45);
- (0x1F48, 0x1F4D);
- (0x1F50, 0x1F57);
- (0x1F59, 0x1F59);
- (0x1F5B, 0x1F5B);
- (0x1F5D, 0x1F5D);
- (0x1F5F, 0x1F7D);
- (0x1F80, 0x1FB4);
- (0x1FB6, 0x1FBC);
- (0x1FBE, 0x1FBE);
- (0x1FC2, 0x1FC4);
- (0x1FC6, 0x1FCC);
- (0x1FD0, 0x1FD3);
- (0x1FD6, 0x1FDB);
- (0x1FE0, 0x1FEC);
- (0x1FF2, 0x1FF4);
- (0x1FF6, 0x1FFC);
- (0x2126, 0x2126);
- (0x212A, 0x212B);
- (0x212E, 0x212E);
- (0x2180, 0x2182);
- (0x3041, 0x3094);
- (0x30A1, 0x30FA);
- (0x3105, 0x312C);
- (0xAC00, 0xD7A3);
- ]
-
-let ideographic = [(0x3007, 0x3007); (0x3021, 0x3029); (0x4E00, 0x9FA5)]
-
-let combining_char =
- [
- (0x0300, 0x0345);
- (0x0360, 0x0361);
- (0x0483, 0x0486);
- (0x0591, 0x05A1);
- (0x05A3, 0x05B9);
- (0x05BB, 0x05BD);
- (0x05BF, 0x05BF);
- (0x05C1, 0x05C2);
- (0x05C4, 0x05C4);
- (0x064B, 0x0652);
- (0x0670, 0x0670);
- (0x06D6, 0x06DC);
- (0x06DD, 0x06DF);
- (0x06E0, 0x06E4);
- (0x06E7, 0x06E8);
- (0x06EA, 0x06ED);
- (0x0901, 0x0903);
- (0x093C, 0x093C);
- (0x093E, 0x094C);
- (0x094D, 0x094D);
- (0x0951, 0x0954);
- (0x0962, 0x0963);
- (0x0981, 0x0983);
- (0x09BC, 0x09BC);
- (0x09BE, 0x09BE);
- (0x09BF, 0x09BF);
- (0x09C0, 0x09C4);
- (0x09C7, 0x09C8);
- (0x09CB, 0x09CD);
- (0x09D7, 0x09D7);
- (0x09E2, 0x09E3);
- (0x0A02, 0x0A02);
- (0x0A3C, 0x0A3C);
- (0x0A3E, 0x0A3E);
- (0x0A3F, 0x0A3F);
- (0x0A40, 0x0A42);
- (0x0A47, 0x0A48);
- (0x0A4B, 0x0A4D);
- (0x0A70, 0x0A71);
- (0x0A81, 0x0A83);
- (0x0ABC, 0x0ABC);
- (0x0ABE, 0x0AC5);
- (0x0AC7, 0x0AC9);
- (0x0ACB, 0x0ACD);
- (0x0B01, 0x0B03);
- (0x0B3C, 0x0B3C);
- (0x0B3E, 0x0B43);
- (0x0B47, 0x0B48);
- (0x0B4B, 0x0B4D);
- (0x0B56, 0x0B57);
- (0x0B82, 0x0B83);
- (0x0BBE, 0x0BC2);
- (0x0BC6, 0x0BC8);
- (0x0BCA, 0x0BCD);
- (0x0BD7, 0x0BD7);
- (0x0C01, 0x0C03);
- (0x0C3E, 0x0C44);
- (0x0C46, 0x0C48);
- (0x0C4A, 0x0C4D);
- (0x0C55, 0x0C56);
- (0x0C82, 0x0C83);
- (0x0CBE, 0x0CC4);
- (0x0CC6, 0x0CC8);
- (0x0CCA, 0x0CCD);
- (0x0CD5, 0x0CD6);
- (0x0D02, 0x0D03);
- (0x0D3E, 0x0D43);
- (0x0D46, 0x0D48);
- (0x0D4A, 0x0D4D);
- (0x0D57, 0x0D57);
- (0x0E31, 0x0E31);
- (0x0E34, 0x0E3A);
- (0x0E47, 0x0E4E);
- (0x0EB1, 0x0EB1);
- (0x0EB4, 0x0EB9);
- (0x0EBB, 0x0EBC);
- (0x0EC8, 0x0ECD);
- (0x0F18, 0x0F19);
- (0x0F35, 0x0F35);
- (0x0F37, 0x0F37);
- (0x0F39, 0x0F39);
- (0x0F3E, 0x0F3E);
- (0x0F3F, 0x0F3F);
- (0x0F71, 0x0F84);
- (0x0F86, 0x0F8B);
- (0x0F90, 0x0F95);
- (0x0F97, 0x0F97);
- (0x0F99, 0x0FAD);
- (0x0FB1, 0x0FB7);
- (0x0FB9, 0x0FB9);
- (0x20D0, 0x20DC);
- (0x20E1, 0x20E1);
- (0x302A, 0x302F);
- (0x3099, 0x3099);
- (0x309A, 0x309A);
- ]
-
-let digit =
- [
- (0x0030, 0x0039);
- (0x0660, 0x0669);
- (0x06F0, 0x06F9);
- (0x0966, 0x096F);
- (0x09E6, 0x09EF);
- (0x0A66, 0x0A6F);
- (0x0AE6, 0x0AEF);
- (0x0B66, 0x0B6F);
- (0x0BE7, 0x0BEF);
- (0x0C66, 0x0C6F);
- (0x0CE6, 0x0CEF);
- (0x0D66, 0x0D6F);
- (0x0E50, 0x0E59);
- (0x0ED0, 0x0ED9);
- (0x0F20, 0x0F29);
- ]
-
-let extender =
- [
- (0x00B7, 0x00B7);
- (0x02D0, 0x02D1);
- (0x0387, 0x0387);
- (0x0640, 0x0640);
- (0x0E46, 0x0E46);
- (0x0EC6, 0x0EC6);
- (0x3005, 0x3005);
- (0x3031, 0x3035);
- (0x309D, 0x309E);
- (0x30FC, 0x30FE);
- ]
-
-let blank = [(0x0009, 0x000A); (0x000D, 0x000D); (0x0020, 0x0020)]
-let letter = union base_char ideographic
-
-(* Letters to be used in identifiers, as specified
- by ISO ....
- Data provided by John M. Skaller *)
-let tr8876_ident_char =
- [
- (* ASCII *)
- (0x0041, 0x005a);
- (0x0061, 0x007a);
- (* Latin *)
- (0x00c0, 0x00d6);
- (0x00d8, 0x00f6);
- (0x00f8, 0x01f5);
- (0x01fa, 0x0217);
- (0x0250, 0x02a8);
- (* Greek *)
- (0x0384, 0x0384);
- (0x0388, 0x038a);
- (0x038c, 0x038c);
- (0x038e, 0x03a1);
- (0x03a3, 0x03ce);
- (0x03d0, 0x03d6);
- (0x03da, 0x03da);
- (0x03dc, 0x03dc);
- (0x03de, 0x03de);
- (0x03e0, 0x03e0);
- (0x03e2, 0x03f3);
- (* Cyrillic *)
- (0x0401, 0x040d);
- (0x040f, 0x044f);
- (0x0451, 0x045c);
- (0x045e, 0x0481);
- (0x0490, 0x04c4);
- (0x04c7, 0x04c4);
- (0x04cb, 0x04cc);
- (0x04d0, 0x04eb);
- (0x04ee, 0x04f5);
- (0x04f8, 0x04f9);
- (* Armenian *)
- (0x0531, 0x0556);
- (0x0561, 0x0587);
- (0x04d0, 0x04eb);
- (* Hebrew *)
- (0x05d0, 0x05ea);
- (0x05f0, 0x05f4);
- (* Arabic *)
- (0x0621, 0x063a);
- (0x0640, 0x0652);
- (0x0670, 0x06b7);
- (0x06ba, 0x06be);
- (0x06c0, 0x06ce);
- (0x06e5, 0x06e7);
- (* Devanagari *)
- (0x0905, 0x0939);
- (0x0958, 0x0962);
- (* Bengali *)
- (0x0985, 0x098c);
- (0x098f, 0x0990);
- (0x0993, 0x09a8);
- (0x09aa, 0x09b0);
- (0x09b2, 0x09b2);
- (0x09b6, 0x09b9);
- (0x09dc, 0x09dd);
- (0x09df, 0x09e1);
- (0x09f0, 0x09f1);
- (* Gurmukhi *)
- (0x0a05, 0x0a0a);
- (0x0a0f, 0x0a10);
- (0x0a13, 0x0a28);
- (0x0a2a, 0x0a30);
- (0x0a32, 0x0a33);
- (0x0a35, 0x0a36);
- (0x0a38, 0x0a39);
- (0x0a59, 0x0a5c);
- (0x0a5e, 0x0a5e);
- (* Gunjarati *)
- (0x0a85, 0x0a8b);
- (0x0a8d, 0x0a8d);
- (0x0a8f, 0x0a91);
- (0x0a93, 0x0aa8);
- (0x0aaa, 0x0ab0);
- (0x0ab2, 0x0ab3);
- (0x0ab5, 0x0ab9);
- (0x0ae0, 0x0ae0);
- (* Oriya *)
- (0x0b05, 0x0b0c);
- (0x0b0f, 0x0b10);
- (0x0b13, 0x0b28);
- (0x0b2a, 0x0b30);
- (0x0b32, 0x0b33);
- (0x0b36, 0x0b39);
- (0x0b5c, 0x0b5d);
- (0x0b5f, 0x0b61);
- (* Tamil *)
- (0x0b85, 0x0b8a);
- (0x0b8e, 0x0b90);
- (0x0b92, 0x0b95);
- (0x0b99, 0x0b9a);
- (0x0b9c, 0x0b9c);
- (0x0b9e, 0x0b9f);
- (0x0ba3, 0x0ba4);
- (0x0ba8, 0x0baa);
- (0x0bae, 0x0bb5);
- (0x0bb7, 0x0bb9);
- (* Telugu *)
- (0x0c05, 0x0c0c);
- (0x0c0e, 0x0c10);
- (0x0c12, 0x0c28);
- (0x0c2a, 0x0c33);
- (0x0c35, 0x0c39);
- (0x0c60, 0x0c61);
- (* Kannada *)
- (0x0c85, 0x0c8c);
- (0x0c8e, 0x0c90);
- (0x0c92, 0x0ca8);
- (0x0caa, 0x0cb3);
- (0x0cb5, 0x0cb9);
- (0x0ce0, 0x0ce1);
- (* Malayam *)
- (0x0d05, 0x0d0c);
- (0x0d0e, 0x0d10);
- (0x0d12, 0x0d28);
- (0x0d2a, 0x0d39);
- (0x0d60, 0x0d61);
- (* Thai *)
- (0x0e01, 0x0e30);
- (0x0e32, 0x0e33);
- (0x0e40, 0x0e46);
- (0x0e4f, 0x0e5b);
- (* Lao *)
- (0x0e81, 0x0e82);
- (0x0e84, 0x0e84);
- (0x0e87, 0x0e88);
- (0x0e8a, 0x0e8a);
- (0x0e0d, 0x0e0d);
- (0x0e94, 0x0e97);
- (0x0e99, 0x0e9f);
- (0x0ea1, 0x0ea3);
- (0x0ea5, 0x0ea5);
- (0x0ea7, 0x0ea7);
- (0x0eaa, 0x0eab);
- (0x0ead, 0x0eb0);
- (0x0eb2, 0x0eb3);
- (0x0ebd, 0x0ebd);
- (0x0ec0, 0x0ec4);
- (0x0ec6, 0x0ec6);
- (* Georgian *)
- (0x10a0, 0x10c5);
- (0x10d0, 0x10f6);
- (* Hangul Jamo *)
- (0x1100, 0x1159);
- (0x1161, 0x11a2);
- (0x11a8, 0x11f9);
- (0x11d0, 0x11f6);
- (* Latin extensions *)
- (0x1e00, 0x1e9a);
- (0x1ea0, 0x1ef9);
- (* Greek extended *)
- (0x1f00, 0x1f15);
- (0x1f18, 0x1f1d);
- (0x1f20, 0x1f45);
- (0x1f48, 0x1f4d);
- (0x1f50, 0x1f57);
- (0x1f59, 0x1f59);
- (0x1f5b, 0x1f5b);
- (0x1f5d, 0x1f5d);
- (0x1f5f, 0x1f7d);
- (0x1f80, 0x1fb4);
- (0x1fb6, 0x1fbc);
- (0x1fc2, 0x1fc4);
- (0x1fc6, 0x1fcc);
- (0x1fd0, 0x1fd3);
- (0x1fd6, 0x1fdb);
- (0x1fe0, 0x1fec);
- (0x1ff2, 0x1ff4);
- (0x1ff6, 0x1ffc);
- (* Hiragana *)
- (0x3041, 0x3094);
- (0x309b, 0x309e);
- (* Katakana *)
- (0x30a1, 0x30fe);
- (* Bopmofo *)
- (0x3105, 0x312c);
- (* CJK Unified Ideographs *)
- (0x4e00, 0x9fa5);
- (* CJK Compatibility Ideographs *)
- (0xf900, 0xfa2d);
- (* Arabic Presentation Forms *)
- (0xfb1f, 0xfb36);
- (0xfb38, 0xfb3c);
- (0xfb3e, 0xfb3e);
- (0xfb40, 0xfb41);
- (0xfb42, 0xfb44);
- (0xfb46, 0xfbb1);
- (0xfbd3, 0xfd35);
- (* Arabic Presentation Forms-A *)
- (0xfd50, 0xfd85);
- (0xfd92, 0xfbc7);
- (0xfdf0, 0xfdfb);
- (* Arabic Presentation Forms-B *)
- (0xfe70, 0xfe72);
- (0xfe74, 0xfe74);
- (0xfe76, 0xfefc);
- (* Half width and Fullwidth Forms *)
- (0xff21, 0xff3a);
- (0xff41, 0xff5a);
- (0xff66, 0xffbe);
- (0xffc2, 0xffc7);
- (0xffca, 0xffcf);
- (0xffd2, 0xffd7);
- (0xffd2, 0xffd7);
- (0xffda, 0xffdc);
- ]
+include Sedlex_utils.Cset