diff options
author | Stephane Glondu <steph@glondu.net> | 2022-02-20 17:37:27 +0100 |
---|---|---|
committer | Stephane Glondu <steph@glondu.net> | 2022-02-20 17:37:27 +0100 |
commit | 084bfcb45334fdd9cbc7ffec804b225fae82ad56 (patch) | |
tree | 4250fa700ab952f990706f92e56955599564dc6b | |
parent | 9db54116bb06f17fa8d8256490a91c13767ec1ae (diff) |
New upstream version 1.0.3
-rw-r--r-- | B0.ml | 62 | ||||
-rw-r--r-- | BRZO | 1 | ||||
-rw-r--r-- | CHANGES.md | 6 | ||||
-rw-r--r-- | LICENSE.md | 2 | ||||
-rw-r--r-- | README.md | 7 | ||||
-rw-r--r-- | _tags | 4 | ||||
-rw-r--r-- | doc/api.odocl | 1 | ||||
-rw-r--r-- | doc/index.mld | 17 | ||||
-rw-r--r-- | opam | 52 | ||||
-rw-r--r-- | pkg/META | 4 | ||||
-rw-r--r-- | src/uutf.ml | 7 | ||||
-rw-r--r-- | src/uutf.mli | 75 | ||||
-rw-r--r-- | test/test.ml | 5 | ||||
-rw-r--r-- | test/tests.itarget | 3 | ||||
-rw-r--r-- | test/utftrip.ml | 7 |
15 files changed, 176 insertions, 77 deletions
@@ -0,0 +1,62 @@ +open B0_kit.V000 +open B00_std + + +(* OCaml library names *) + +let uutf = B0_ocaml.libname "uutf" +let unix = B0_ocaml.libname "unix" +let cmdliner = B0_ocaml.libname "cmdliner" + +(* Libraries *) + +let uutf_lib = + let srcs = Fpath.[`Dir (v "src")] in + let requires = [] in + B0_ocaml.lib uutf ~doc:"The uutf library" ~srcs ~requires + +(* Tests *) + +let test = + let srcs = Fpath.[`File (v "test/test.ml")] in + let meta = B0_meta.(empty |> tag test) in + let requires = [ uutf ] in + B0_ocaml.exe "test" ~doc:"Test suite" ~srcs ~meta ~requires + +let utftrip = + let doc = "Recode UTF-{8,16,16LE,16BE} and latin1 from stdin to stdout" in + let srcs = Fpath.[`File (v "test/utftrip.ml")] in + let requires = [unix; uutf; cmdliner] in + B0_ocaml.exe "utftrip" ~doc ~srcs ~requires + +(* Packs *) + +let default = + let meta = + let open B0_meta in + empty + |> tag B0_opam.tag + |> add authors ["The uutf programmers"] + |> add maintainers ["Daniel Bünzli <daniel.buenzl i@erratique.ch>"] + |> add homepage "https://erratique.ch/software/uutf" + |> add online_doc "https://erratique.ch/software/uutf/doc/" + |> add licenses ["ISC"] + |> add repo "git+https://erratique.ch/repos/uutf.git" + |> add issues "https://github.com/dbuenzli/uutf/issues" + |> add description_tags + ["unicode"; "text"; "utf-8"; "utf-16"; "codec"; "org:erratique"] + |> add B0_opam.Meta.depopts ["cmdliner", ""] + |> add B0_opam.Meta.conflicts + [ "cmdliner", {|< "0.9.8"|}] + |> add B0_opam.Meta.depends + [ "ocaml", {|>= "4.03.0"|}; + "ocamlfind", {|build|}; + "ocamlbuild", {|build|}; + "topkg", {|build & >= "1.0.3"|}; + ] + |> add B0_opam.Meta.build + {|[["ocaml" "pkg/pkg.ml" "build" "--dev-pkg" "%{dev}%" + "--with-cmdliner" "%{cmdliner:installed}%"]]|} + in + B0_pack.v "default" ~doc:"uutf package" ~meta ~locked:true @@ + B0_unit.list () @@ -0,0 +1 @@ +(srcs-x pkg)
\ No newline at end of file @@ -1,3 +1,9 @@ +v1.0.3 2022-02-03 +----------------- + +- Support for OCaml 5.00, thanks to Kate (@kit-ty-kate) for + the patch. + v1.0.2 2019-02-05 La Forclaz (VS) --------------------------------- @@ -1,4 +1,4 @@ -Copyright (c) 2016 Daniel C. Bünzli +Copyright (c) 2016 The uutf programmers Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above @@ -1,6 +1,6 @@ Uutf — Non-blocking streaming Unicode codec for OCaml ------------------------------------------------------------------------------- -v1.0.2 +v1.0.3 Uutf is a non-blocking streaming codec to decode and encode the UTF-8, UTF-16, UTF-16LE and UTF-16BE encoding schemes. It can efficiently @@ -9,7 +9,8 @@ character position tracking and support newline normalization. Functions are also provided to fold over the characters of UTF encoded OCaml string values and to directly encode characters in OCaml -Buffer.t values. +Buffer.t values. **Note** that since OCaml 4.14, that functionality +can be found in the Stdlib and you are encouraged to migrate to it. Uutf has no dependency and is distributed under the ISC license. @@ -33,7 +34,7 @@ The documentation and API reference is automatically generated by from the source interfaces. It can be consulted [online][doc] or via `odig doc uutf`. -[doc]: http://erratique.ch/software/uutf/doc/Uutf +[doc]: http://erratique.ch/software/uutf/doc/ ## Sample programs @@ -1,7 +1,7 @@ -true : bin_annot, safe_string, package(bytes), package(uchar) +true : bin_annot, safe_string +<_b0> : -traverse <src> : include - <test> : include <test/utftrip.*> : package(unix), package(cmdliner) <test/examples.*> : package(unix)
\ No newline at end of file diff --git a/doc/api.odocl b/doc/api.odocl deleted file mode 100644 index 4b879fc..0000000 --- a/doc/api.odocl +++ /dev/null @@ -1 +0,0 @@ -Uutf diff --git a/doc/index.mld b/doc/index.mld new file mode 100644 index 0000000..1ba1460 --- /dev/null +++ b/doc/index.mld @@ -0,0 +1,17 @@ +{0 Uutf {%html: <span class="version">v1.0.3</span>%}} + +Uutf is a non-blocking streaming codec to decode and encode the UTF-8, +UTF-16, UTF-16LE and UTF-16BE encoding schemes. It can efficiently +work character by character without blocking on IO. Decoders perform +character position tracking and support newline normalization. + +Functions are also provided to fold over the characters of UTF encoded +OCaml string values and to directly encode characters in OCaml +{!Buffer.t} values. {b Note} that since OCaml 4.14, that functionality +can be found in the Stdlib and you are encouraged to migrate to it. + +{1:api API} + +{!modules: +Uutf +} @@ -1,23 +1,35 @@ -version: "1.0.2" +version: "1.0.3" opam-version: "2.0" -maintainer: "Daniel Bünzli <daniel.buenzl i@erratique.ch>" -authors: ["Daniel Bünzli <daniel.buenzl i@erratique.ch>"] -homepage: "http://erratique.ch/software/uutf" -doc: "http://erratique.ch/software/uutf/doc/Uutf" -dev-repo: "git+http://erratique.ch/repos/uutf.git" +name: "uutf" +synopsis: """Non-blocking streaming Unicode codec for OCaml""" +maintainer: ["Daniel Bünzli <daniel.buenzl i@erratique.ch>"] +authors: ["The uutf programmers"] +homepage: "https://erratique.ch/software/uutf" +doc: "https://erratique.ch/software/uutf/doc/" +dev-repo: "git+https://erratique.ch/repos/uutf.git" bug-reports: "https://github.com/dbuenzli/uutf/issues" -tags: [ "unicode" "text" "utf-8" "utf-16" "codec" "org:erratique" ] -license: "ISC" -depends: [ - "ocaml" {>= "4.01.0"} - "ocamlfind" {build} - "ocamlbuild" {build} - "topkg" {build} - "uchar" -] +license: ["ISC"] +tags: ["unicode" "text" "utf-8" "utf-16" "codec" "org:erratique"] +depends: ["ocaml" {>= "4.03.0"} + 
"ocamlfind" {build} + "ocamlbuild" {build} + "topkg" {build & >= "1.0.3"}] depopts: ["cmdliner"] -conflicts: ["cmdliner" { < "0.9.6"} ] -build: [[ - "ocaml" "pkg/pkg.ml" "build" - "--pinned" "%{pinned}%" - "--with-cmdliner" "%{cmdliner:installed}%" ]]
\ No newline at end of file +conflicts: ["cmdliner" {< "0.9.8"}] +build: [["ocaml" "pkg/pkg.ml" "build" "--dev-pkg" "%{dev}%" + "--with-cmdliner" "%{cmdliner:installed}%"]] +description: """ +Uutf is a non-blocking streaming codec to decode and encode the UTF-8, +UTF-16, UTF-16LE and UTF-16BE encoding schemes. It can efficiently +work character by character without blocking on IO. Decoders perform +character position tracking and support newline normalization. + +Functions are also provided to fold over the characters of UTF encoded +OCaml string values and to directly encode characters in OCaml +Buffer.t values. **Note** that since OCaml 4.14, that functionality +can be found in the Stdlib and you are encouraged to migrate to it. + +Uutf has no dependency and is distributed under the ISC license. + +Home page: http://erratique.ch/software/uutf +Contact: Daniel Bünzli `<daniel.buenzl i@erratique.ch>`""" @@ -1,6 +1,6 @@ -version = "1.0.2" +version = "1.0.3" description = "Non-blocking streaming Unicode codec for OCaml" -requires = "bytes uchar" +requires = "" archive(byte) = "uutf.cma" archive(native) = "uutf.cmxa" plugin(byte) = "uutf.cma" diff --git a/src/uutf.ml b/src/uutf.ml index e0fc60c..eafca5f 100644 --- a/src/uutf.ml +++ b/src/uutf.ml @@ -1,7 +1,6 @@ (*--------------------------------------------------------------------------- - Copyright (c) 2012 Daniel C. Bünzli. All rights reserved. + Copyright (c) 2012 The uutf programmers. All rights reserved. Distributed under the ISC license, see terms at the end of the file. - uutf v1.0.2 ---------------------------------------------------------------------------*) let io_buffer_size = 65536 (* IO_BUFFER_SIZE 4.0.0 *) @@ -31,7 +30,7 @@ let u_rep = Uchar.unsafe_of_int 0xFFFD (* replacement character. *) type encoding = [ `UTF_8 | `UTF_16 | `UTF_16BE | `UTF_16LE ] type decoder_encoding = [ encoding | `US_ASCII | `ISO_8859_1 ] -let encoding_of_string s = match String.uppercase s with (* IANA names. 
*) +let encoding_of_string s = match String.uppercase_ascii s with (* IANA names. *) | "UTF-8" -> Some `UTF_8 | "UTF-16" -> Some `UTF_16 | "UTF-16LE" -> Some `UTF_16LE @@ -807,7 +806,7 @@ module Buffer = struct end (*--------------------------------------------------------------------------- - Copyright (c) 2012 Daniel C. Bünzli + Copyright (c) 2012 The uutf programmers Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above diff --git a/src/uutf.mli b/src/uutf.mli index c63fc19..9c34820 100644 --- a/src/uutf.mli +++ b/src/uutf.mli @@ -1,7 +1,6 @@ (*--------------------------------------------------------------------------- - Copyright (c) 2012 Daniel C. Bünzli. All rights reserved. + Copyright (c) 2012 The uutf programmers. All rights reserved. Distributed under the ISC license, see terms at the end of the file. - uutf v1.0.2 ---------------------------------------------------------------------------*) (** Non-blocking streaming Unicode codec. @@ -13,32 +12,34 @@ character without blocking on IO. Decoders perform character position tracking and support {{!nln}newline normalization}. - Functions are also provided to {{!String} fold over} the - characters of UTF encoded OCaml string values and to - {{!Buffer}directly encode} characters in OCaml {!Buffer.t} - values. + Functions are also provided to {{!String} fold over} the characters + of UTF encoded OCaml string values and to {{!Buffer}directly encode} + characters in OCaml {!Stdlib.Buffer.t} values. {b Note} that since OCaml + 4.14, that functionality can be found in {!Stdlib.String} and + {!Stdlib.Buffer} and you are encouraged to migrate to it. See {{:#examples}examples} of use. - {e v1.0.2 — {{:http://erratique.ch/software/uutf }homepage}} - - {3 References} - {ul - {- The Unicode Consortium. - {e {{:http://www.unicode.org/versions/latest}The Unicode Standard}}. 
- (latest version)}} + {b References} + {ul + {- The Unicode Consortium. + {e {{:http://www.unicode.org/versions/latest}The Unicode Standard}}. + (latest version)}} *) (** {1:ucharcsts Special Unicode characters} *) val u_bom : Uchar.t (** [u_bom] is the {{:http://unicode.org/glossary/#byte_order_mark}byte - order mark} (BOM) character ([U+FEFF]). *) + order mark} (BOM) character ([U+FEFF]). From OCaml 4.06 on, use + {!Uchar.bom}. *) val u_rep : Uchar.t (** [u_rep] is the {{:http://unicode.org/glossary/#replacement_character}replacement} - character ([U+FFFD]). *) + character ([U+FFFD]). From OCaml 4.06 on, use + {!Uchar.rep}. *) + (** {1:schemes Unicode encoding schemes} *) @@ -121,7 +122,7 @@ val decoder : ?nln:[< nln] -> ?encoding:[< decoder_encoding] -> [< src] -> can only be [`UTF_8], [`UTF_16BE] or [`UTF_16LE]. The heuristic looks at the first three bytes of input (or less if impossible) and takes the {e first} matching byte pattern in the table below. -{[ +{v xx = any byte .. = any byte or no byte (input too small) pp = positive byte @@ -137,7 +138,7 @@ pp 00 .. | `UTF_16LE | ASCII UTF-16LE and U+0000 is often forbidden uu .. .. | `UTF_8 | ASCII UTF-8 or valid UTF-8 first byte. xx xx .. | `UTF_16BE | Not UTF-8 => UTF-16, no BOM => UTF-16BE .. .. .. | `UTF_8 | Single malformed UTF-8 byte or no input. -]} +v} This heuristic is compatible both with BOM based recognition and {{:http://tools.ietf.org/html/rfc4627#section-3}JSON-like encoding @@ -153,12 +154,12 @@ xx xx .. | `UTF_16BE | Not UTF-8 => UTF-16, no BOM => UTF-16BE and character count of the last decoded character (including [`Malformed] ones) are respectively returned by {!decoder_line}, {!decoder_col}, {!decoder_byte_count} and {!decoder_count}. Before - the first call to {!decode} the line number is [1] and the column - is [0]. Each {!decode} returning [`Uchar] or [`Malformed] + the first call to {!val-decode} the line number is [1] and the column + is [0]. 
Each {!val-decode} returning [`Uchar] or [`Malformed] increments the column until a newline. On a newline, the line number is incremented and the column set to zero. For example the line is [2] and column [0] after the first newline was - decoded. This can be understood as if {!decode} was moving an + decoded. This can be understood as if {!val-decode} was moving an insertion point to the right in the data. A {e newline} is anything normalized by [`Readline], see {!nln}. @@ -205,7 +206,7 @@ val set_decoder_encoding : decoder -> [< decoder_encoding] -> unit (** [set_decoder_encoding d enc] changes the decoded encoding to [enc] after decoding started. - {b Warning.} Call only after {!decode} was called on [d] and that the + {b Warning.} Call only after {!val-decode} was called on [d] and that the last call to it returned something different from [`Await] or data may be lost. After encoding guess wait for at least three [`Uchar]s. *) @@ -213,25 +214,25 @@ val set_decoder_encoding : decoder -> [< decoder_encoding] -> unit val decoder_line : decoder -> int (** [decoder_line d] is the line number of the last - decoded (or malformed) character. See {!decoder} for details. *) + decoded (or malformed) character. See {!val-decoder} for details. *) val decoder_col : decoder -> int (** [decoder_col d] is the column number of the last decoded - (or malformed) character. See {!decoder} for details. *) + (or malformed) character. See {!val-decoder} for details. *) val decoder_byte_count : decoder -> int (** [decoder_byte_count d] is the number of bytes already decoded on - [d] (including malformed ones). This is the last {!decode}'s + [d] (including malformed ones). This is the last {!val-decode}'s end byte offset counting from the beginning of the stream. *) val decoder_count : decoder -> int (** [decoder_count d] is the number of characters already decoded on [d] - (including malformed ones). See {!decoder} for details. *) + (including malformed ones). 
See {!val-decoder} for details. *) val decoder_removed_bom : decoder -> bool (** [decoder_removed_bom d] is [true] iff an {e initial} {{:http://unicode.org/glossary/#byte_order_mark}BOM} was - removed from the input stream. See {!decoder} for details. *) + removed from the input stream. See {!val-decoder} for details. *) val decoder_src : decoder -> src (** [decoder_src d] is [d]'s input source. *) @@ -267,7 +268,7 @@ val encode : {ul {- [`Partial] iff [e] has a [`Manual] destination and needs more output storage. The client must use {!Manual.dst} to provide a new buffer - and then call {!encode} with [`Await] until [`Ok] is returned.} + and then call {!val-encode} with [`Await] until [`Ok] is returned.} {- [`Ok] when the encoder is ready to encode a new [`Uchar] or [`End]}} For [`Manual] destination, encoding [`End] always returns @@ -293,15 +294,15 @@ val encoder_dst : encoder -> dst module Manual : sig val src : decoder -> Bytes.t -> int -> int -> unit (** [src d s j l] provides [d] with [l] bytes to read, starting at - [j] in [s]. This byte range is read by calls to {!decode} with [d] + [j] in [s]. This byte range is read by calls to {!val-decode} with [d] until [`Await] is returned. To signal the end of input call the function with [l = 0]. *) val dst : encoder -> Bytes.t -> int -> int -> unit (** [dst e s j l] provides [e] with [l] bytes to write, starting - at [j] in [s]. This byte range is written by calls to {!encode} with [e] - until [`Partial] is returned. Use {!dst_rem} to know the remaining - number of non-written free bytes in [s]. *) + at [j] in [s]. This byte range is written by calls to + {!val-encode} with [e] until [`Partial] is returned. Use {!dst_rem} to + know the remaining number of non-written free bytes in [s]. 
*) val dst_rem : encoder -> int (** [dst_rem e] is the remaining number of non-written, free bytes @@ -310,7 +311,10 @@ end (** {1:strbuf String folders and Buffer encoders} *) -(** Fold over the characters of UTF encoded OCaml [string] values. *) +(** Fold over the characters of UTF encoded OCaml [string] values. + + {b Note.} Since OCaml 4.14, UTF decoders are available in + {!Stdlib.String}. You are encouraged to migrate to them. *) module String : sig (** {1 Encoding guess} *) @@ -358,7 +362,10 @@ module String : sig [String.length s - pos]. *) end -(** UTF encode characters in OCaml {!Buffer.t} values. *) +(** UTF encode characters in OCaml {!Buffer.t} values. + + {b Note.} Since OCaml 4.06, these encoders are available in + {!Stdlib.Buffer}. You are encouraged to migrate to them. *) module Buffer : sig (** {1 Buffer encoders} *) @@ -487,7 +494,7 @@ end *) (*--------------------------------------------------------------------------- - Copyright (c) 2012 Daniel C. Bünzli + Copyright (c) 2012 The uutf programmers Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above diff --git a/test/test.ml b/test/test.ml index 57be950..0f622a6 100644 --- a/test/test.ml +++ b/test/test.ml @@ -1,7 +1,6 @@ (*--------------------------------------------------------------------------- - Copyright (c) 2012 Daniel C. Bünzli. All rights reserved. + Copyright (c) 2012 The uutf programmers. All rights reserved. Distributed under the ISC license, see terms at the end of the file. - uutf v1.0.2 ---------------------------------------------------------------------------*) let u_nl = Uchar.of_int 0x000A @@ -377,7 +376,7 @@ let test () = let () = if not (!Sys.interactive) then test () (*--------------------------------------------------------------------------- - Copyright (c) 2012 Daniel C. 
Bünzli + Copyright (c) 2012 The uutf programmers Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above diff --git a/test/tests.itarget b/test/tests.itarget deleted file mode 100644 index 7b23501..0000000 --- a/test/tests.itarget +++ /dev/null @@ -1,3 +0,0 @@ -test.native -examples.native -utftrip.native
\ No newline at end of file diff --git a/test/utftrip.ml b/test/utftrip.ml index 714af74..627bf0c 100644 --- a/test/utftrip.ml +++ b/test/utftrip.ml @@ -1,7 +1,6 @@ (*--------------------------------------------------------------------------- - Copyright (c) 2012 Daniel C. Bünzli. All rights reserved. + Copyright (c) 2012 The uutf programmers. All rights reserved. Distributed under the ISC license, see terms at the end of the file. - uutf v1.0.2 ---------------------------------------------------------------------------*) let str = Printf.sprintf @@ -385,14 +384,14 @@ let cmd = in Term.(pure do_cmd $ cmd $ file $ sin $ sout $ use_unix $ usize $ ienc $ oenc $ nln $ rseed $ rcount), - Term.info "utftrip" ~version:"v1.0.2" ~doc ~man + Term.info "utftrip" ~version:"v1.0.3" ~doc ~man let () = match Term.eval cmd with | `Error _ -> exit 1 | _ -> if !input_malformed then exit 2 else exit 0 (*--------------------------------------------------------------------------- - Copyright (c) 2012 Daniel C. Bünzli + Copyright (c) 2012 The uutf programmers Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above |