1 files changed, 52 insertions, 14 deletions
diff --git a/src/lib/sedlexing.mli b/src/lib/sedlexing.mli
index 9df2f17..b129ff1 100644
--- a/src/lib/sedlexing.mli
+++ b/src/lib/sedlexing.mli
@@ -45,27 +45,37 @@ exception MalFormed
         Uchars [a], a position [pos] and a code point count [n].  The
         function should put [n] code points or less in [a], starting at
         position [pos], and return the number of characters provided. A
-        return value of 0 means end of input. *)
-val create : (Uchar.t array -> int -> int -> int) -> lexbuf
-
-(** set the initial tracked input position for [lexbuf].
-        If set to [Lexing.dummy_pos], Sedlexing will not track position
-        information for you. *)
-val set_position : lexbuf -> Lexing.position -> unit
+        return value of 0 means end of input. The [bytes_per_char] argument
+        is optional. If unspecified, byte positions are the same as code
+        point positions. *)
+val create :
+  ?bytes_per_char:(Uchar.t -> int) ->
+  (Uchar.t array -> int -> int -> int) ->
+  lexbuf
+
+(** Set the initial tracked input position, in code points, for [lexbuf].
+        If [bytes_position] is unspecified, the byte position is set to the
+        same value as the code point position. *)
+val set_position :
+  ?bytes_position:Lexing.position -> lexbuf -> Lexing.position -> unit
 
 (** [set_filename lexbuf file] sets the filename to [file] in
         [lexbuf]. It also sets the {!Lexing.pos_fname} field in
         returned {!Lexing.position} records. *)
 val set_filename : lexbuf -> string -> unit
 
-(** Create a lexbuf from a stream of Unicode code points. *)
-val from_gen : Uchar.t Gen.t -> lexbuf
+(** Create a lexbuf from a [Gen.t] stream of Unicode code points. If
+    [bytes_per_char] is omitted, byte positions equal code point positions. *)
+val from_gen : ?bytes_per_char:(Uchar.t -> int) -> Uchar.t Gen.t -> lexbuf
 
-(** Create a lexbuf from an array of Unicode code points. *)
-val from_int_array : int array -> lexbuf
+(** Create a lexbuf from an [int] array of Unicode code points. If
+    [bytes_per_char] is omitted, byte positions equal code point positions. *)
+val from_int_array : ?bytes_per_char:(Uchar.t -> int) -> int array -> lexbuf
 
-(** Create a lexbuf from an array of Unicode code points. *)
-val from_uchar_array : Uchar.t array -> lexbuf
+(** Create a lexbuf from a [Uchar.t] array of Unicode code points. If
+    [bytes_per_char] is omitted, byte positions equal code point positions. *)
+val from_uchar_array :
+  ?bytes_per_char:(Uchar.t -> int) -> Uchar.t array -> lexbuf
 
 (** {6 Interface for lexers semantic actions} *)
 
@@ -78,29 +88,57 @@ val from_uchar_array : Uchar.t array -> lexbuf
         The first code point of the stream has offset 0. *)
 val lexeme_start : lexbuf -> int
 
+(** [Sedlexing.lexeme_bytes_start lexbuf] returns the offset in the
+        input stream of the first byte of the matched string.
+        The first byte of the stream has offset 0. *)
+val lexeme_bytes_start : lexbuf -> int
+
 (** [Sedlexing.lexeme_end lexbuf] returns the offset in the input
         stream of the character following the last code point of the
         matched string. The first character of the stream has offset
         0. *)
 val lexeme_end : lexbuf -> int
 
+(** [Sedlexing.lexeme_bytes_end lexbuf] returns the offset in the input
+        stream of the byte following the last code point of the
+        matched string. The first byte of the stream has offset
+        0. *)
+val lexeme_bytes_end : lexbuf -> int
+
 (** [Sedlexing.loc lexbuf] returns the pair
         [(Sedlexing.lexeme_start lexbuf,Sedlexing.lexeme_end
         lexbuf)]. *)
 val loc : lexbuf -> int * int
 
+(** [Sedlexing.bytes_loc lexbuf] returns the pair of byte offsets
+        [(Sedlexing.lexeme_bytes_start lexbuf, Sedlexing.lexeme_bytes_end
+        lexbuf)]. *)
+val bytes_loc : lexbuf -> int * int
+
 (** [Sedlexing.lexeme_length lexbuf] returns the difference
         [(Sedlexing.lexeme_end lexbuf) - (Sedlexing.lexeme_start
         lexbuf)], that is, the length (in code points) of the matched
         string. *)
 val lexeme_length : lexbuf -> int
 
+(** [Sedlexing.lexeme_bytes_length lexbuf] returns the difference
+        [(Sedlexing.lexeme_bytes_end lexbuf) - (Sedlexing.lexeme_bytes_start
+        lexbuf)], that is, the length (in bytes) of the matched
+        string. *)
+val lexeme_bytes_length : lexbuf -> int
+
 (** [Sedlexing.lexing_positions lexbuf] returns the start and end
-        positions of the current token, using a record of type
+        positions, in code points, of the current token, using a record of type
         [Lexing.position]. This is intended for consumption
         by parsers like those generated by [Menhir]. *)
 val lexing_positions : lexbuf -> Lexing.position * Lexing.position
 
+(** [Sedlexing.lexing_bytes_positions lexbuf] returns the start and end
+        positions of the current token, expressed in bytes, as a pair of
+        [Lexing.position] records. This is intended for consumption
+        by parsers like those generated by [Menhir]. *)
+val lexing_bytes_positions : lexbuf -> Lexing.position * Lexing.position
+
 (** [Sedlexing.new_line lexbuf] increments the line count and
         sets the beginning of line to the current position, as though
         a newline character had been encountered in the input. *)