summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJames Cowgill <jcowgill@debian.org>2023-11-29 19:50:06 +0000
committerJames Cowgill <jcowgill@debian.org>2023-11-29 19:50:06 +0000
commit1a23af3ab4ffcc045291cfa23f80d842edc4877d (patch)
treee96d8c8a4bce7ae5b391d24081b090bed5d1f466
parent8c56f68b6fb386c5df7e2b7c2e2e72d3f392c844 (diff)
parent46fbd223191890161d0409ee542ac6bc22537bf7 (diff)
Update upstream source from tag 'upstream/0.0.8'
Update to upstream version '0.0.8' with Debian dir e343cd97b840fe74c9b2e75b834a8faac2e35ef3
-rw-r--r--.gitignore37
-rw-r--r--CMakeLists.txt48
-rw-r--r--README.md80
-rw-r--r--doc/README.maintainer8
-rwxr-xr-xscript/BuildLangModel.py21
-rw-r--r--script/BuildLangModelLogs/LangDanishModel.log384
-rw-r--r--script/charsets/ibm865.py71
-rw-r--r--script/langs/da.py2
-rw-r--r--script/langs/no.py55
-rw-r--r--src/CMakeLists.txt19
-rw-r--r--src/LangModels/LangDanishModel.cpp174
-rw-r--r--src/LangModels/LangNorwegianModel.cpp323
-rw-r--r--src/nsSBCSGroupProber.cpp166
-rw-r--r--src/nsSBCSGroupProber.h2
-rw-r--r--src/nsSBCharSetProber.h6
-rw-r--r--src/tools/CMakeLists.txt19
-rw-r--r--src/tools/uchardet.cpp17
-rw-r--r--test/da/ibm865.txt5
-rw-r--r--test/mt/iso-8859-3.txt5
-rw-r--r--test/no/ibm865.txt17
-rw-r--r--test/no/iso-8859-1.txt20
-rw-r--r--test/no/iso-8859-15.txt21
-rw-r--r--test/no/utf-8.txt20
-rw-r--r--test/no/windows-1252.txt21
-rw-r--r--test/uchardet-tests.c7
-rw-r--r--uchardet-config.cmake.in19
26 files changed, 1241 insertions, 326 deletions
diff --git a/.gitignore b/.gitignore
index c18dd8d..a7bb970 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,38 @@
__pycache__/
+
+# CMake files
+CMakeCache.txt
+CMakeFiles/
+CTestTestfile.cmake
+cmake_install.cmake
+
+# With make generator
+Makefile
+
+# With ninja generator
+.ninja_deps
+.ninja_log
+build.ninja
+
+# Built files
+uchardet-config-version.cmake
+uchardet-config.cmake
+uchardet-targets.cmake
+uchardet.pc
+src/version.script
+
+# Build binaries
+src/libuchardet.a
+src/libuchardet.so*
+
+src/tools/uchardet
+test/uchardet-tests
+
+# For Windows (untested)
+src/libuchardet.dll
+
+src/tools/uchardet.exe
+test/uchardet-tests.exe
+
+# For macOS (untested)
+src/libuchardet.dylib
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2d2038d..a570264 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
######## Project settings
-cmake_minimum_required(VERSION 2.8.5)
+cmake_minimum_required(VERSION 3.1)
include(CheckCCompilerFlag)
set (PACKAGE_NAME uchardet)
project (${PACKAGE_NAME} CXX C)
@@ -10,7 +10,7 @@ set (PACKAGE_URL https://www.freedesktop.org/wiki/Software/uchardet/)
set (PACKAGE_BUGREPORT https://gitlab.freedesktop.org/uchardet/uchardet/-/issues)
set (UCHARDET_VERSION_MAJOR 0)
set (UCHARDET_VERSION_MINOR 0)
-set (UCHARDET_VERSION_REVISION 7)
+set (UCHARDET_VERSION_REVISION 8)
if (CMAKE_BUILD_TYPE MATCHES Debug)
set (version_suffix .debug)
@@ -39,10 +39,10 @@ if (BUILD_SHARED_LIBS)
endif (BUILD_SHARED_LIBS)
if (TARGET_ARCHITECTURE STREQUAL "")
- string(TOLOWER ${CMAKE_SYSTEM_PROCESSOR} TARGET_ARCHITECTURE)
+ string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" TARGET_ARCHITECTURE)
endif (TARGET_ARCHITECTURE STREQUAL "")
-if (TARGET_ARCHITECTURE MATCHES ".*(x86)|(amd).*")
+if (TARGET_ARCHITECTURE MATCHES ".*(x86|amd|i686).*")
CHECK_C_COMPILER_FLAG(-msse2 SUPPORTS_CFLAG_SSE2)
CHECK_C_COMPILER_FLAG(-mfpmath=sse SUPPORTS_CFLAG_SSE_MATH)
if (CHECK_SSE2 AND SUPPORTS_CFLAG_SSE2 AND SUPPORTS_CFLAG_SSE_MATH)
@@ -52,7 +52,7 @@ if (TARGET_ARCHITECTURE MATCHES ".*(x86)|(amd).*")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffloat-store")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffloat-store")
endif (CHECK_SSE2 AND SUPPORTS_CFLAG_SSE2 AND SUPPORTS_CFLAG_SSE_MATH)
-endif (TARGET_ARCHITECTURE MATCHES ".*(x86)|(amd).*")
+endif (TARGET_ARCHITECTURE MATCHES ".*(x86|amd|i686).*")
configure_file(
uchardet.pc.in
@@ -62,7 +62,7 @@ configure_file(
install(
FILES
- ${CMAKE_BINARY_DIR}/uchardet.pc
+ ${CMAKE_CURRENT_BINARY_DIR}/uchardet.pc
DESTINATION
${CMAKE_INSTALL_LIBDIR}/pkgconfig
)
@@ -72,3 +72,39 @@ install(
add_subdirectory(src)
add_subdirectory(doc)
add_subdirectory(test)
+
+######## Exported targets
+
+install(
+ EXPORT UchardetTargets
+ FILE ${PACKAGE_NAME}-targets.cmake
+ NAMESPACE ${PACKAGE_NAME}::
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PACKAGE_NAME}
+)
+
+export(
+ EXPORT UchardetTargets
+ FILE "${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}-targets.cmake"
+ NAMESPACE ${PACKAGE_NAME}::
+)
+
+include(CMakePackageConfigHelpers)
+write_basic_package_version_file(
+ ${PACKAGE_NAME}-config-version.cmake
+ VERSION ${UCHARDET_VERSION}
+ COMPATIBILITY AnyNewerVersion
+)
+
+configure_file(
+ ${PACKAGE_NAME}-config.cmake.in
+ ${PACKAGE_NAME}-config.cmake
+ @ONLY
+)
+
+install (
+ FILES
+ "${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}-config.cmake"
+ "${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}-config-version.cmake"
+ DESTINATION
+ ${CMAKE_INSTALL_LIBDIR}/cmake/${PACKAGE_NAME}
+)
diff --git a/README.md b/README.md
index a2713ae..f6a7424 100644
--- a/README.md
+++ b/README.md
@@ -4,10 +4,6 @@
uchardet started as a C language binding of the original C++ implementation of the universal charset detection library by Mozilla. It can now detect more charsets, and more reliably than the original implementation.
-The original code of universalchardet is available at http://lxr.mozilla.org/seamonkey/source/extensions/universalchardet/
-
-Techniques used by universalchardet are described at http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
-
## Supported Languages/Encodings
* International (Unicode)
@@ -39,6 +35,7 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj
* IBM852
* MAC-CENTRALEUROPE
* Danish
+ * IBM865
* ISO-8859-1
* ISO-8859-15
* WINDOWS-1252
@@ -103,6 +100,11 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj
* ISO-8859-13
* Maltese
* ISO-8859-3
+ * Norwegian
+ * IBM865
+ * ISO-8859-1
+ * ISO-8859-15
+ * WINDOWS-1252
* Polish:
* ISO-8859-2
* ISO-8859-13
@@ -181,6 +183,10 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj
### Mac
brew install uchardet
+
+ or
+
+ port install uchardet
### Windows
@@ -194,7 +200,8 @@ to use MinGW-w64 instead of MinGW, in particular to build both 32 and
64-bit DLL libraries).
Note also that it is very easily cross-buildable (for instance from a
-GNU/Linux machine).
+GNU/Linux machine; [crossroad](https://pypi.org/project/crossroad/) may
+help, this is what we use in our CI).
### Build from source
@@ -231,13 +238,38 @@ Here is a working "module" section to include in your Flatpak's json manifest:
]
```
+### Build with CMake exported targets
+
+uchardet installs a standard pkg-config file which will make it easily
+discoverable by any modern build system. Nevertheless if your project also uses
+CMake and you want to discover uchardet installation using CMake exported
+targets, you may find and link uchardet with:
+
+```
+project(sample LANGUAGES C)
+find_package ( uchardet )
+if (uchardet_FOUND)
+ add_executable( sample sample.c )
+ target_link_libraries ( sample PRIVATE uchardet::libuchardet )
+endif ()
+```
+
+Note though that we recommend library discovery with `pkg-config` because it
+is standard and generic. Therefore it will always work, even if we decided to
+change our own build system (which is not planned right now, but may always
+happen). This is why we advise using standard `pkg-config` discovery.
+
+Some more CMake specificities may be found in the [commit
+message](https://gitlab.freedesktop.org/uchardet/uchardet/-/commit/d7dad549bd5a3442b92e861bcd2c5cda2adeea27)
+which implemented such support.
+
## Usage
### Command Line
```
uchardet Command Line Tool
-Version 0.0.7
+Version 0.0.8
Authors: BYVoid, Jehan
Bug Report: https://gitlab.freedesktop.org/uchardet/uchardet/-/issues
@@ -254,8 +286,41 @@ Options:
See [uchardet.h](https://gitlab.freedesktop.org/uchardet/uchardet/-/blob/master/src/uchardet.h)
+## History
+
+As said in the introduction, this was initially a project of Mozilla to
+allow better detection of page encodings, and it used to be part of
+Firefox. If I am not mistaken, this is not the case anymore (probably because
+nowadays most websites better announce their encoding, and also UTF-8 is
+much more widespread).
+
+Techniques used by universalchardet are described at https://www-archive.mozilla.org/projects/intl/universalcharsetdetection
+
+It is to be noted that a lot has changed since the original code, yet
+the base concept is still around, basing detection not just on encoding
+rules, but importantly on analysis of character statistics in languages.
+
+Original code by Mozilla does not seem to be found anymore anywhere, but
+it's probably not too far from the initial commit of this repository.
+
+Mozilla code was extracted and packaged into a standalone library under
+the name `uchardet` by BYVoid in 2011, in a personal repository.
+Starting 2015, I (i.e. Jehan) started contributing, "standardized"
+the output to be iconv-compatible, added various encoding/language
+support and streamlined generation of sources for new support of
+encoding/languages by using texts from Wikipedia as statistics source on
+languages through Python scripts. Then I soon became co-maintainer.
+In 2016, `uchardet` became a freedesktop project.
+
## Related Projects
+Some of these are bindings of `uchardet`, others are forks of the same
+initial code, which has diverged over time, others are native ports in
+other languages.
+This list is not exhaustive and only meant as a point of interest. We
+don't follow the status of these projects.
+
+ * [R-uchardet](https://cran.r-project.org/package=uchardet) R binding on CRAN
* [python-chardet](https://github.com/chardet/chardet) Python port
* [ruby-rchardet](http://rubyforge.org/projects/chardet/) Ruby port
* [juniversalchardet](http://code.google.com/p/juniversalchardet/) Java port of universalchardet
@@ -264,7 +329,7 @@ See [uchardet.h](https://gitlab.freedesktop.org/uchardet/uchardet/-/blob/master/
* [nchardet](http://www.conceptdevelopment.net/Localization/NCharDet/) C# port of chardet
* [uchardet-enhanced](https://bitbucket.org/medoc/uchardet-enhanced) A fork of mozilla universalchardet
* [rust-uchardet](https://github.com/emk/rust-uchardet) Rust language binding of uchardet
- * [libchardet](https://ftp.oops.org/pub/oops/libchardet/) Another C/C++ API wrapping Mozilla code.
+ * [libchardet](https://github.com/Joungkyun/libchardet) Another C/C++ API wrapping Mozilla code.
## Used by
@@ -272,6 +337,7 @@ See [uchardet.h](https://gitlab.freedesktop.org/uchardet/uchardet/-/blob/master/
* [Tepl](https://wiki.gnome.org/Projects/Tepl)
* [Nextcloud IOS app](https://github.com/nextcloud/ios)
* [Codelite](https://codelite.org)
+* [QtAV](https://www.qtav.org/)
* …
## Licenses
diff --git a/doc/README.maintainer b/doc/README.maintainer
index 0bc52f9..4577615 100644
--- a/doc/README.maintainer
+++ b/doc/README.maintainer
@@ -48,8 +48,12 @@ Cf. EXAMPLES section in `git help archive`.
The archive and its checksum file should now be available from:
https://www.freedesktop.org/software/uchardet/releases/
+* Make the git tag into a GitLab release (not automatic).
+ It will be found at: https://gitlab.freedesktop.org/uchardet/uchardet/-/tags/vx.y.z
+ Just click the "Edit release notes" button, and copy-paste the tag comment as "release notes".
+
* Update the wiki page: https://www.freedesktop.org/wiki/Software/uchardet/
- The release note will be the tag content:
- https://cgit.freedesktop.org/uchardet/uchardet/tag/?h=vx.y.z
+ The release note link will be:
+ https://gitlab.freedesktop.org/uchardet/uchardet/-/releases/vx.y.z
* Spread the good news!
diff --git a/script/BuildLangModel.py b/script/BuildLangModel.py
index 38ac793..43f975c 100755
--- a/script/BuildLangModel.py
+++ b/script/BuildLangModel.py
@@ -50,6 +50,7 @@ import requests
import sys
import re
import os
+import random
# Custom modules.
import charsets.db
@@ -240,12 +241,22 @@ def visit_pages(titles, depth, lang, logfd):
return
next_titles = []
+ if options.max_page is not None:
+ max_titles = int(options.max_page/(options.max_depth * options.max_depth))
+ else:
+ max_titles = sys.maxsize
for title in titles:
if options.max_page is not None and \
len(visited_pages) > options.max_page:
return
if title in visited_pages:
continue
+
+ # Ugly hack skipping internal pages
+ if 'wiki' in title or 'Wiki' in title:
+ print('Skipping', title)
+ continue
+
visited_pages += [title]
try:
page = wikipedia.page(title)
@@ -255,16 +266,22 @@ def visit_pages(titles, depth, lang, logfd):
print("Discarding page {}.\n".format(title))
continue
logfd.write("\n{} (revision {})".format(title, page.revision_id))
+ logfd.flush()
process_text(page.content, lang)
try:
- next_titles += page.links
+ links = page.links
+ random.shuffle(links)
+ if len(links) > max_titles:
+ links = links[:max_titles]
+ next_titles += links
except KeyError:
pass
if depth >= options.max_depth:
return
+ random.shuffle(next_titles)
visit_pages (next_titles, depth + 1, lang, logfd)
language_c = lang.name.replace('-', '_').title()
@@ -277,6 +294,7 @@ logfd.write('\n- Maximum depth: {}'.format(options.max_depth))
if options.max_page is not None:
logfd.write('\n- Max number of pages: {}'.format(options.max_page))
logfd.write('\n\n== Parsed pages ==\n')
+logfd.flush()
try:
visit_pages(lang.start_pages, 0, lang, logfd)
except requests.exceptions.ConnectionError:
@@ -284,6 +302,7 @@ except requests.exceptions.ConnectionError:
exit(1)
logfd.write('\n\n== End of Parsed pages ==')
logfd.write('\n\n- Wikipedia parsing ended at: {}\n'.format(str(datetime.datetime.now())))
+logfd.flush()
########### CHARACTERS ###########
diff --git a/script/BuildLangModelLogs/LangDanishModel.log b/script/BuildLangModelLogs/LangDanishModel.log
index cf183b3..14bf65d 100644
--- a/script/BuildLangModelLogs/LangDanishModel.log
+++ b/script/BuildLangModelLogs/LangDanishModel.log
@@ -1,158 +1,256 @@
= Logs of language model for Danish (da) =
- Generated by BuildLangModel.py
-- Started: 2016-02-19 17:53:58.564190
-- Maximum depth: 4
-- Max number of pages: 100
+- Started: 2022-11-30 19:37:01.097250
+- Maximum depth: 2
+- Max number of pages: 200
== Parsed pages ==
-Forside (revision 2692411)
-16. februar (revision 6877446)
-17. februar (revision 8454583)
-1878 (revision 8280505)
-19. februar (revision 8206479)
-1922 (revision 8455105)
-1926 (revision 8425271)
-1942 (revision 8443554)
-1945 (revision 8448461)
-1948 (revision 8454392)
-1985 (revision 8409096)
-2. verdenskrig (revision 8433181)
-23. oktober (revision 6877825)
-26. oktober (revision 7849938)
-3C 273 (revision 8443798)
-A-bus (revision 8427319)
-Aktuelle begivenheder (revision 8440596)
-B-52 Stratofortress (revision 8422571)
-Borgerkrigen i Syrien (revision 8447763)
-Boutros Boutros-Ghali (revision 8453935)
-Brasilien (revision 8452750)
-Cusco (region) (revision 7693764)
-Danmark (revision 8451178)
-Danmark i Eurovision Song Contest (revision 8453514)
-Dansk (sprog) (revision 8455750)
-Dansk Melodi Grand Prix 2016 (revision 8452164)
-Dobbeltmordet på Peter Bangs Vej (revision 8334648)
-Encyklopædi (revision 8446641)
-Eritrea-sagen (revision 8452285)
-Eurovision Song Contest 2014 (revision 8445804)
-Eurovision Song Contest 2016 (revision 8453588)
-Flygtningekrisen i Europa 2015 (revision 8452286)
-Fonograf (revision 8177165)
-Formel 1 (revision 8450846)
-Formel 1 2016 (revision 8456463)
-Frederik 6. (revision 8438503)
-Første observation af gravitationsbølger (revision 8451269)
-Grammofon (revision 8375093)
-Guadalcanal (revision 7796248)
-Harper Lee (revision 8456583)
-Hartkorn (revision 8437552)
-IC4 (revision 8446402)
-IC4-sagen (revision 8434463)
-Islamisk Stat (revision 8439228)
-Jonathan Leunbach (revision 8452603)
-Juliane Marie af Braunschweig-Wolfenbüttel (revision 8437957)
-Kaliumklorid (revision 8452216)
-Kejserriget Japan (revision 8044942)
-Kevin Magnussen (revision 8455302)
-København (revision 8427847)
-LIGO (revision 8451266)
-Latinamerika (revision 7692181)
-Leonid Hurwicz (revision 8445727)
-Lighthouse X (revision 8452940)
-Linkoban (revision 8455879)
-Machu Picchu (revision 8406907)
-Matador (tv-serie) (revision 8454648)
-Middelaldercentret (revision 8449194)
-Nobelprisen (revision 8409809)
-Nykøbing Falster (revision 8452825)
-Nyligt afdøde (revision 8456580)
-Overvågning (revision 8455039)
-Panorama (foto) (revision 8448393)
-Peru (revision 8437485)
-Peter Lauritsen (revision 8456097)
-Professor (revision 8415451)
-Renault F1 (revision 8450843)
-S-bus (revision 8455589)
-Salomonøerne (revision 8238961)
-Slaget om Belgien (1940) (revision 8430013)
-Slaget om Guadalcanal (revision 7762887)
-Slaget om Henderson Field (revision 8445480)
-Slaget om Iwo Jima (revision 8145239)
-Soldiers of Love (Lighthouse X-sang) (revision 8452929)
-Solen (revision 8276478)
-Stillehavskrigen (revision 8430649)
-Stockholm (revision 8358042)
-Søslaget ved Guadalcanal (revision 7772812)
-Thomas Edison (revision 8282441)
-Togulykken ved Bad Aibling (revision 8455364)
-Topografi (revision 6886168)
-USA (revision 8448088)
-United States Army (revision 8401635)
-United States Marine Corps (revision 8401667)
-Vestallierede (revision 6961443)
-Wikimedia (revision 8263252)
-Wikipedia (revision 8267051)
-Zikavirus (revision 8454832)
-1. februar (revision 8404985)
-10. februar (revision 6877431)
-11. februar (revision 6877433)
-12. februar (revision 6877437)
-13. februar (revision 6877438)
-14. februar (revision 6877441)
-1497 (revision 7369489)
-15. februar (revision 7329463)
-1560 (revision 7874693)
-1568 (revision 7369703)
-1620 (revision 7423903)
-1688 (revision 7367090)
-18. februar (revision 6877450)
+Forside (revision 10000691)
+Hans Magnus Enzensberger (revision 11341046)
+28. november (revision 9410945)
+Golfkrigen (revision 11144370)
+29. november (revision 6877900)
+8. december (revision 10277754)
+Det Konservative Folkeparti (revision 11313857)
+1990 (revision 11340072)
+1940 (revision 11263756)
+Angolas håndboldlandshold (damer) (revision 11331888)
+Skjoldvulkan (revision 10870812)
+Casper & Mandrilaftalen (revision 11221713)
+26. november (revision 10617630)
+Døde i 2022 (revision 11343986)
+Vikingetidens rustning og våben (revision 11332607)
+Middelaldercentret (revision 11339897)
+Ruslands invasion af Ukraine 2022 (revision 11335164)
+Saddam Hussein (revision 11002258)
+The Jimi Hendrix Experience (revision 10497780)
+Færøerne (revision 11333678)
+27. november (revision 9745974)
+Thomas Vinterberg (revision 11234643)
+Anwar Ibrahim (revision 11342876)
+Mandatområdet i Palæstina (revision 11341286)
+Kunst (revision 11336917)
+Afrikamesterskabet i håndbold 2022 (kvinder) (revision 11341917)
+Dansk (sprog) (revision 11313509)
+Sergej Sjojgu (revision 11309097)
+Fernando Gomes (revision 11340427)
+Folketinget (revision 11330485)
+15. januar (revision 10515606)
+Rock and Roll Hall of Fame (revision 8408189)
+Thomas Edison (revision 11052704)
+Ukraine (revision 11334630)
+1947 (revision 11252357)
+1937 (revision 11303923)
+IC4 (revision 11317878)
+Jimi Hendrix (revision 11341476)
+Ismail Sabri Yaakob (revision 11105534)
+Okipa-ceremonien (revision 11340589)
+SI-præfiks (revision 11332802)
+Sporvejsmuseet Skjoldenæsholms historie (revision 11338275)
+Irak (revision 11255676)
+Woodstockfestivalen (revision 11226413)
+Nikolaj Lie Kaas (revision 11322663)
+Torben Rechendorff (revision 11342962)
+Folketingsvalget 2022 (revision 11339557)
+Kherson (revision 11314559)
+Keltere (revision 11318773)
+Little Richard (revision 11226619)
+Invasion (revision 10307980)
+Tate Gallery (revision 8312688)
+24. januar (revision 10441562)
+Hans Christian Ægidius (revision 9773029)
+Slaget ved Irpin (1321) (revision 11230064)
+Auschwitz (revision 11310714)
+Jazz fusion (revision 11223082)
+Lutsk (revision 11248429)
+Planetarium (revision 11266837)
+Bibliothèque nationale de France (revision 11055813)
+Digtsamling (revision 10585337)
+Kenneth Gøtterup (revision 11027437)
+Straf (revision 11007456)
+1716 (revision 11339928)
+Kamel (revision 11285016)
+Amnesti (revision 10831621)
+Zulu Royal (revision 10969220)
+Stephen Roche (revision 11239346)
+13. december (revision 10768225)
+Enhed (politisk parti) (revision 10158693)
+The Everly Brothers (revision 10865882)
+3. november (revision 9423371)
+Annelise Gotfredsen (revision 11306090)
+Virtual International Authority File (revision 8702589)
+Europæiske Fællesskab (revision 10868689)
+Væringer (revision 11331002)
+Rom (revision 11341285)
+Decentralisering (revision 11154770)
+Kreml (Moskva) (revision 11045482)
+Folketingsvalget 1994 (revision 11266325)
+28. december (revision 6878014)
+Østjyllands Storkreds (revision 11201505)
+Bruxelles (revision 10802416)
+Erik Haunstrup Clemmensen (revision 10627614)
+Hviderussere (revision 10750673)
+Hvidmelet Gåsefod (revision 11317723)
+Mario Draghi (revision 11302527)
+Folketingsvalget 1968 (revision 11300317)
+Skudår (revision 10360386)
+1921 (revision 11303917)
+Rundkørsel (revision 11103019)
+Valerij Zaluzjnyj (revision 11335164)
+Angrebet på Pearl Harbor (revision 11309782)
+Folketingsmedlemmer valgt i 2007 (revision 11187293)
+Ingeniørvidenskab (revision 9816520)
+Vikinger (revision 11327511)
+Martin Luther King (revision 11320659)
+1757 (revision 11186195)
+Dieseltogsæt (revision 8177984)
+El-værk (revision 11334293)
+Soul (revision 11283982)
+John McVie (revision 11040471)
+Botswanas håndboldlandshold (herrer) (revision 11333322)
+1971 (revision 11243510)
+Rana Hussein (revision 11266594)
+DR (revision 11342995)
+Ewan McGregor (revision 11331681)
+Eliane Paulo (revision 10589121)
+Zepto- (revision 11332802)
+København (revision 11336925)
+Gallien (revision 9984925)
+Augustoprøret (revision 11234324)
+1991 (revision 11250037)
+Afledte SI-enheder (revision 11097802)
+Gemeinsame Normdatei (revision 11281765)
+Litteraturvidenskab (revision 10931878)
+Thorvald Stauning (revision 11107677)
+Afrikamesterskabet i håndbold 2018 (mænd) (revision 11131830)
+Folkeforbundet (revision 11315450)
+Readymades (revision 10932287)
+Al Anbar (revision 9458175)
+2007 (revision 11250033)
+Varieté (revision 10934358)
+Damaskus (revision 11030795)
+Palæstina (revision 11311424)
+1569 (revision 10832219)
+Pædagog (revision 11251603)
+Carina Christensen (revision 11073847)
+Vest-Tyskland (revision 10580737)
+20. november (revision 6877846)
+Tessa Jowell (revision 11225831)
+Hillerød (revision 11317306)
+Påskekrisen (1920) (revision 11287865)
+Grad (vinkelmål) (revision 9624298)
+Kvinde (revision 11333939)
+1931 (revision 11236350)
+Afrikaans (revision 11080347)
+Den Store Danske Encyklopædi (revision 11301417)
+22. juni (revision 10375853)
+Automatic Train Control (revision 10619401)
+Luc Montagnier (revision 11162267)
+Reprise Records (revision 11081843)
+1966 (revision 11336105)
+Prosa (skriveform) (revision 11236012)
+Michael af Rumænien (revision 10819975)
+Mykolajiv (revision 11236676)
+Khmelnytskyj oblast (revision 11188686)
+Sierra Leones håndboldlandshold (herrer) (revision 11333322)
+1969 (revision 11340081)
+H. Edvard Hass (revision 10348478)
+Københavns Idrætspark (revision 9400386)
+Sanna Nielsen (revision 11315712)
+19. maj (revision 7148596)
+Patricia Schumann (revision 10952761)
+Torstenssonfejden (revision 11326728)
+International Standard Name Identifier (revision 10880739)
+Bent Mejding (revision 11335462)
+Afdeling Q (revision 11279134)
+Alfred Bindslev (revision 10398140)
+Sakser (revision 9042633)
+Folketingsmedlemmer valgt i 1998 (revision 11213304)
+1996 (revision 11229565)
+1 (tal) (revision 9378579)
+Farrah Fawcett (revision 10977527)
+Google+ (revision 10469085)
+1530 (revision 10865231)
+De største helte (revision 10737852)
+Afrikamesterskabet i håndbold 1974 (mænd) (revision 11018946)
+1902 (revision 11217211)
+ISO 639-3 (revision 10880691)
+1974 (revision 11336110)
+Dansk fonologi (revision 11226101)
+Europa (revision 11149054)
+Sovemedicin (revision 11327388)
+Slotsbryggen (Nykøbing Falster) (revision 11005548)
+Olieraffinaderi (revision 11322152)
+Slaget ved Stiklestad (revision 11261889)
+Rolling Stone (revision 11267586)
+Jørgen Hald (revision 10296412)
+Nikolaj Coster-Waldau (revision 11228953)
+Aserbajdsjan (revision 11297538)
+Kultstatus (revision 7820159)
+Al Kut (revision 9425606)
+Library of Congress Control Number (revision 8316539)
+Rwandas håndboldlandshold (herrer) (revision 11333322)
+Levon Helm (revision 11317127)
+Howard Hughes (revision 11040881)
+Wim Kieft (revision 10910953)
+Afrikamesterskabet i håndbold 2016 (mænd) (revision 11018957)
+24. februar (revision 10755036)
+Iværksætter (revision 10972242)
+1992 (revision 11303945)
+Internationalt Standardbognummer (revision 11037702)
+Afrikamesterskabet i håndbold 2014 (mænd) (revision 11018956)
+En mand kommer hjem (revision 10737861)
+Jamaica (revision 11243987)
+Henitjesk (revision 11328921)
+August (revision 11210562)
== End of Parsed pages ==
-- Wikipedia parsing ended at: 2016-02-19 17:56:42.162636
+- Wikipedia parsing ended at: 2022-11-30 19:41:17.518631
-53 characters appeared 1301488 times.
+60 characters appeared 1532370 times.
-First 30 characters:
-[ 0] Char e: 15.272749345364689 %
-[ 1] Char r: 8.48482659847805 %
-[ 2] Char n: 7.695652975670924 %
-[ 3] Char t: 6.977014002434137 %
-[ 4] Char a: 6.780469739252302 %
-[ 5] Char i: 6.164636170291236 %
-[ 6] Char s: 6.0942551909814 %
-[ 7] Char d: 5.953493232361728 %
-[ 8] Char l: 5.076650725938311 %
-[ 9] Char o: 4.883026197706011 %
-[10] Char g: 4.012253666572415 %
-[11] Char k: 3.232607599916403 %
-[12] Char m: 3.0863135119186653 %
-[13] Char f: 2.701600014752345 %
-[14] Char v: 2.13970470722742 %
-[15] Char b: 1.982423195603801 %
-[16] Char u: 1.8339777239590376 %
-[17] Char p: 1.5789619266562582 %
-[18] Char h: 1.3433085821767086 %
-[19] Char ø: 0.8730775850411222 %
-[20] Char y: 0.859938777768216 %
-[21] Char å: 0.7699648402443973 %
-[22] Char æ: 0.7208671920140639 %
-[23] Char j: 0.644108896893402 %
-[24] Char c: 0.5698093259407694 %
-[25] Char w: 0.11087309295206717 %
-[26] Char z: 0.05309307500338075 %
-[27] Char x: 0.032424424965885205 %
-[28] Char é: 0.032193919575132464 %
-[29] Char q: 0.012139950579644223 %
+First 31 characters:
+[ 0] Char e: 15.035728968852169 %
+[ 1] Char r: 8.617892545534042 %
+[ 2] Char n: 7.618264518360449 %
+[ 3] Char t: 6.856503324915001 %
+[ 4] Char a: 6.475133290262796 %
+[ 5] Char i: 6.3714377076032545 %
+[ 6] Char s: 6.279488635251278 %
+[ 7] Char d: 5.919523352715076 %
+[ 8] Char l: 5.094722553952375 %
+[ 9] Char o: 4.86860223053179 %
+[10] Char g: 3.8343872563414845 %
+[11] Char k: 3.3303314473658454 %
+[12] Char m: 3.2096034247603384 %
+[13] Char f: 2.608247355403721 %
+[14] Char v: 2.342188896937424 %
+[15] Char u: 1.9602967951604378 %
+[16] Char b: 1.9047619047619049 %
+[17] Char p: 1.5793183108518178 %
+[18] Char h: 1.45728512043436 %
+[19] Char ø: 0.8954103773892728 %
+[20] Char æ: 0.7449897870618715 %
+[21] Char å: 0.7295235484902471 %
+[22] Char y: 0.6777736447463732 %
+[23] Char j: 0.666418684782396 %
+[24] Char c: 0.5946344551250677 %
+[25] Char w: 0.12248999915164091 %
+[26] Char z: 0.06571519933175407 %
+[27] Char x: 0.045354581465311905 %
+[28] Char é: 0.021926819240783886 %
+[29] Char ó: 0.009592983417842949 %
+[30] Char q: 0.009397208246050236 %
-The first 30 characters have an accumulated ratio of 0.9997241618823994.
+The first 31 characters have an accumulated ratio of 0.9994694492844417.
-964 sequences found.
+1065 sequences found.
-First 512 (typical positive ratio): 0.9968082796759031
-Next 512 (512-1024): 7.68351302509128e-07
-Rest: 3.903127820947816e-17
+First 512 (typical positive ratio): 0.9958348814328518
+Next 512 (512-1024): 2.6103356239028435e-06
+Rest: 3.268948339453948e-05
-- Processing end: 2016-02-19 17:56:42.304278
+- Processing end: 2022-11-30 19:41:17.605842
diff --git a/script/charsets/ibm865.py b/script/charsets/ibm865.py
new file mode 100644
index 0000000..7fc3122
--- /dev/null
+++ b/script/charsets/ibm865.py
@@ -0,0 +1,71 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# ##### BEGIN LICENSE BLOCK #####
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Jehan <jehan@girinstud.io>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ##### END LICENSE BLOCK #####
+
+from codepoints import *
+
+name = 'IBM865'
+aliases = ['CP865', '865', 'CSIBM865']
+
+language = \
+{
+ 'complete': [ 'no', 'da' ],
+ 'incomplete': []
+}
+
+# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
+charmap = \
+[
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
+ SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
+ SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,SYM,SYM, # 9X
+ LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # AX
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # CX
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # DX
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM, # EX
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # FX
+]
diff --git a/script/langs/da.py b/script/langs/da.py
index 18d2379..31226a0 100644
--- a/script/langs/da.py
+++ b/script/langs/da.py
@@ -50,7 +50,7 @@ code = 'da'
# ASCII characters are also used in Danish.
use_ascii = True
# The charsets we want to support and create data for.
-charsets = ['ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252']
+charsets = ['ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252', 'IBM865' ]
## Optional Properties ##
diff --git a/script/langs/no.py b/script/langs/no.py
new file mode 100644
index 0000000..93cf23f
--- /dev/null
+++ b/script/langs/no.py
@@ -0,0 +1,55 @@
+#!/bin/python3
+# -*- coding: utf-8 -*-
+
+# ##### BEGIN LICENSE BLOCK #####
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Jehan <jehan@girinstud.io>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ##### END LICENSE BLOCK #####
+
+## Mandatory Properties ##
+
+name = 'Norwegian'
+code = 'no'
+use_ascii = True
+charsets = ['IBM865', 'ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252']
+
+## Optional Properties ##
+
+# Alphabet characters.
+alphabet = 'æøåéìîàêÆØÅ'
+# Some pages that should contain mostly Norwegian text.
+start_pages = ['Norsk', 'Saft', 'Hund']
+wikipedia_code = code
+case_mapping = True
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 61e315f..8128062 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -36,6 +36,7 @@ set(
LangModels/LangThaiModel.cpp
LangModels/LangTurkishModel.cpp
LangModels/LangVietnameseModel.cpp
+ LangModels/LangNorwegianModel.cpp
nsHebrewProber.cpp
nsCharSetProber.cpp
nsBig5Prober.cpp
@@ -84,6 +85,12 @@ if(BUILD_SHARED_LIBS)
target_compile_definitions("${UCHARDET_LIBRARY}" PUBLIC UCHARDET_SHARED)
endif()
+target_include_directories(${UCHARDET_LIBRARY}
+ PUBLIC
+ "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
+ "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/${PACKAGE_NAME}>"
+)
+
if (UCHARDET_STATIC_LIBRARY)
add_library(
${UCHARDET_STATIC_LIBRARY}
@@ -91,6 +98,12 @@ if (UCHARDET_STATIC_LIBRARY)
${UCHARDET_SOURCES}
)
target_compile_definitions("${UCHARDET_STATIC_LIBRARY}" PRIVATE BUILDING_UCHARDET)
+
+ target_include_directories(${UCHARDET_STATIC_LIBRARY}
+ PUBLIC
+ "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
+ "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/${PACKAGE_NAME}>"
+ )
endif (UCHARDET_STATIC_LIBRARY)
set_target_properties(
@@ -121,6 +134,8 @@ if (NOT WIN32)
install(
TARGETS
${UCHARDET_LIBRARY}
+ EXPORT
+ UchardetTargets
LIBRARY DESTINATION
${CMAKE_INSTALL_LIBDIR}
ARCHIVE DESTINATION
@@ -130,6 +145,8 @@ else (NOT WIN32)
install(
TARGETS
${UCHARDET_LIBRARY}
+ EXPORT
+ UchardetTargets
RUNTIME DESTINATION
${CMAKE_INSTALL_BINDIR}
ARCHIVE DESTINATION
@@ -141,6 +158,8 @@ if (UCHARDET_STATIC_LIBRARY)
install(
TARGETS
${UCHARDET_STATIC_LIBRARY}
+ EXPORT
+ UchardetTargets
ARCHIVE DESTINATION
${CMAKE_INSTALL_LIBDIR}
)
diff --git a/src/LangModels/LangDanishModel.cpp b/src/LangModels/LangDanishModel.cpp
index c60e7b2..152dddb 100644
--- a/src/LangModels/LangDanishModel.cpp
+++ b/src/LangModels/LangDanishModel.cpp
@@ -41,7 +41,7 @@
/**
* Generated by BuildLangModel.py
- * On: 2016-02-19 17:56:42.163975
+ * On: 2022-11-30 19:41:17.519380
**/
/* Character Mapping Table:
@@ -67,18 +67,18 @@ static const unsigned char Iso_8859_15_CharToOrderMap[] =
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
- SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */
- 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
- SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */
- 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ SYM, 4, 16, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */
+ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 4, 16, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */
+ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
- SYM,SYM,SYM,SYM,SYM,SYM, 39,SYM, 39,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
- SYM,SYM,SYM,SYM, 53, 42,SYM,SYM, 54,SYM,SYM,SYM, 55, 56, 57,SYM, /* BX */
- 58, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 59, 34, 60, 50, /* CX */
- 43, 47, 51, 36, 52, 61, 30,SYM, 19, 62, 37, 44, 31, 46, 63, 48, /* DX */
- 64, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 65, 34, 66, 50, /* EX */
- 43, 47, 51, 36, 52, 67, 30,SYM, 19, 68, 37, 44, 31, 46, 69, 70, /* FX */
+ SYM,SYM,SYM,SYM,SYM,SYM, 50,SYM, 50,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
+ SYM,SYM,SYM,SYM, 60, 57,SYM,SYM, 61,SYM,SYM,SYM, 43, 43, 62,SYM, /* BX */
+ 39, 32, 44, 53, 36, 21, 20, 42, 38, 28, 63, 46, 64, 35, 47, 52, /* CX */
+ 31, 48, 58, 29, 49, 59, 34,SYM, 19, 65, 37, 66, 33, 40, 55, 41, /* DX */
+ 39, 32, 44, 53, 36, 21, 20, 42, 38, 28, 67, 46, 68, 35, 47, 52, /* EX */
+ 31, 48, 58, 29, 49, 59, 34,SYM, 19, 69, 37, 70, 33, 40, 55, 71, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
@@ -88,18 +88,18 @@ static const unsigned char Iso_8859_1_CharToOrderMap[] =
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
- SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */
- 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
- SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */
- 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ SYM, 4, 16, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */
+ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 4, 16, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */
+ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
- SYM,SYM,SYM,SYM,SYM, 42,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
- 71, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 72, 34, 73, 50, /* CX */
- 43, 47, 51, 36, 52, 74, 30,SYM, 19, 75, 37, 44, 31, 46, 76, 48, /* DX */
- 77, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 78, 34, 79, 50, /* EX */
- 43, 47, 51, 36, 52, 80, 30,SYM, 19, 81, 37, 44, 31, 46, 82, 83, /* FX */
+ SYM,SYM,SYM,SYM,SYM, 57,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
+ 39, 32, 44, 53, 36, 21, 20, 42, 38, 28, 72, 46, 73, 35, 47, 52, /* CX */
+ 31, 48, 58, 29, 49, 59, 34,SYM, 19, 74, 37, 75, 33, 40, 55, 41, /* DX */
+ 39, 32, 44, 53, 36, 21, 20, 42, 38, 28, 76, 46, 77, 35, 47, 52, /* EX */
+ 31, 48, 58, 29, 49, 59, 34,SYM, 19, 78, 37, 79, 33, 40, 55, 80, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
@@ -109,61 +109,83 @@ static const unsigned char Windows_1252_CharToOrderMap[] =
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
- SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */
- 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
- SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */
- 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
- SYM,ILL,SYM, 84,SYM,SYM,SYM,SYM,SYM,SYM, 39,SYM, 85,ILL, 86,ILL, /* 8X */
- ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 39,SYM, 87,ILL, 88, 89, /* 9X */
+ SYM, 4, 16, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */
+ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 4, 16, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */
+ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ SYM,ILL,SYM, 81,SYM,SYM,SYM,SYM,SYM,SYM, 50,SYM, 43,ILL, 82,ILL, /* 8X */
+ ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 50,SYM, 43,ILL, 83, 84, /* 9X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
- SYM,SYM,SYM,SYM,SYM, 42,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
- 90, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 91, 34, 92, 50, /* CX */
- 43, 47, 51, 36, 52, 93, 30,SYM, 19, 94, 37, 44, 31, 46, 95, 48, /* DX */
- 96, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 97, 34, 98, 50, /* EX */
- 43, 47, 51, 36, 52, 99, 30,SYM, 19,100, 37, 44, 31, 46,101,102, /* FX */
+ SYM,SYM,SYM,SYM,SYM, 57,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
+ 39, 32, 44, 53, 36, 21, 20, 42, 38, 28, 85, 46, 86, 35, 47, 52, /* CX */
+ 31, 48, 58, 29, 49, 59, 34,SYM, 19, 87, 37, 88, 33, 40, 55, 41, /* DX */
+ 39, 32, 44, 53, 36, 21, 20, 42, 38, 28, 89, 46, 90, 35, 47, 52, /* EX */
+ 31, 48, 58, 29, 49, 59, 34,SYM, 19, 91, 37, 92, 33, 40, 55, 93, /* FX */
+};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+
+static const unsigned char Ibm865_CharToOrderMap[] =
+{
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 4, 16, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */
+ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 4, 16, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */
+ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ 42, 33, 28, 44, 36, 39, 21, 42, 94, 46, 38, 52, 47, 95, 36, 21, /* 8X */
+ 28, 20, 20, 49, 34, 58, 96, 97, 98, 34, 33, 19,SYM, 19,SYM,SYM, /* 9X */
+ 32, 35, 29, 37, 48, 48,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* DX */
+ 45, 41, 99, 56,100,101, 57, 54,102,103,104,105,106,107, 51,SYM, /* EX */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
/* Model Table:
- * Total sequences: 964
- * First 512 sequences: 0.9968082796759031
- * Next 512 sequences (512-1024): 0.0031917203240968304
- * Rest: 3.903127820947816e-17
+ * Total sequences: 1065
+ * First 512 sequences: 0.9958348814328518
+ * Next 512 sequences (512-1024): 0.0041324290837536455
+ * Rest: 3.268948339453948e-05
* Negative sequences: TODO
*/
static const PRUint8 DanishLangModel[] =
{
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,3,3,3,2,3,0,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,2,3,3,3,3,3,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,2,3,3,2,3,3,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,0,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,3,3,3,2,2,0,0,
- 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,3,3,3,3,3,3,2,2,2,2,3,2,
- 3,3,3,3,3,3,3,2,3,3,2,3,3,2,3,2,3,2,3,3,3,3,3,2,2,2,2,2,0,0,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,3,0,
- 3,3,3,3,3,3,3,2,3,3,3,2,2,3,3,3,3,2,3,3,3,3,3,3,2,2,2,2,2,0,
- 3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,2,2,2,2,3,3,3,2,2,0,0,2,0,
- 3,3,3,3,3,3,3,2,3,3,2,2,2,2,2,3,3,2,2,3,3,3,3,3,2,2,0,0,2,0,
- 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3,3,2,3,0,2,2,3,2,3,3,0,2,
- 3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,3,3,2,3,3,2,2,0,2,0,2,0,
- 3,3,3,3,3,3,2,2,3,3,2,2,3,2,3,2,3,2,2,3,3,3,3,3,2,3,2,2,2,0,
- 3,3,3,3,2,2,3,3,3,2,3,3,3,2,3,3,0,2,2,2,2,0,0,3,0,0,2,0,0,0,
- 3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,3,3,3,2,2,2,0,0,0,2,2,2,0,0,0,
- 3,3,3,3,2,0,3,3,3,2,3,3,2,2,3,3,0,2,2,2,0,0,0,0,0,0,0,0,0,0,
- 2,3,3,3,0,3,3,3,3,2,3,3,3,3,3,3,2,2,2,0,0,0,0,0,2,0,0,0,0,0,
- 3,3,2,3,3,3,3,3,3,3,2,2,2,2,2,2,3,2,2,3,3,2,3,2,2,0,0,0,0,0,
- 3,3,2,3,3,3,2,2,3,3,2,3,2,2,0,2,3,2,3,0,3,0,0,2,3,2,2,0,2,2,
- 3,2,2,2,3,3,2,2,2,3,0,2,2,2,0,2,2,0,2,0,2,0,0,0,2,2,2,0,0,0,
- 3,2,2,2,3,3,2,2,0,3,0,2,2,0,0,2,2,2,2,2,2,0,0,2,2,0,2,0,0,0,
- 3,2,0,2,2,3,2,0,2,2,0,0,2,2,2,2,2,2,2,2,0,0,0,0,2,2,0,0,2,0,
- 2,3,2,2,2,0,2,2,2,2,2,2,2,0,2,2,0,2,0,0,0,0,0,0,2,0,0,0,0,0,
- 0,0,0,0,3,2,2,2,2,2,0,0,0,0,2,2,3,0,2,0,0,0,0,0,0,0,0,0,0,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,3,3,3,2,3,0,0,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,2,
+ 3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,1,2,2,1,
+ 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,0,2,2,3,3,3,3,3,3,2,0,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,3,2,3,3,2,0,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,2,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,0,2,2,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,2,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,1,2,3,3,3,3,2,2,2,0,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,2,0,0,2,0,0,
+ 3,3,3,3,3,3,3,2,3,3,2,3,3,3,3,3,2,2,3,3,3,3,3,3,1,2,2,1,2,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,2,3,3,3,3,3,3,2,0,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,3,3,2,2,3,2,2,3,2,2,2,3,3,3,2,3,2,0,0,0,1,1,0,
+ 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,3,3,3,2,2,0,2,2,3,2,2,3,0,0,2,
+ 3,3,3,3,3,3,3,2,3,3,2,2,2,2,2,3,3,2,2,3,3,3,3,3,2,0,2,0,2,2,0,
+ 3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,3,3,3,3,3,2,2,0,1,0,2,2,0,
+ 3,3,3,3,3,3,3,2,3,3,1,2,3,2,3,3,2,2,2,3,3,3,3,3,2,3,0,0,2,2,1,
+ 3,3,3,3,0,2,3,3,3,2,3,3,3,2,3,2,3,2,2,0,0,0,2,3,0,2,1,0,0,0,0,
+ 2,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,2,2,0,0,0,0,0,2,0,0,0,0,0,0,
+ 3,3,3,3,0,0,3,3,3,2,2,3,2,2,3,0,3,2,2,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,2,3,3,3,2,0,0,2,2,2,2,2,0,0,0,0,
+ 3,3,3,2,3,3,3,3,3,3,2,2,2,2,2,3,2,2,2,3,3,2,3,0,2,0,0,0,0,2,0,
+ 3,3,2,3,3,3,2,2,3,3,2,3,2,2,2,3,2,2,3,0,2,0,3,2,3,0,2,2,2,2,2,
+ 3,2,2,2,3,3,2,2,2,3,0,2,2,2,0,2,2,0,2,0,2,0,2,2,2,2,2,0,0,0,2,
+ 3,2,2,2,3,3,2,2,2,3,2,2,2,2,0,2,2,2,2,0,0,0,2,2,0,2,3,0,0,0,0,
+ 3,2,1,2,2,2,2,2,2,2,0,2,1,2,2,0,0,2,0,0,0,0,2,0,2,2,0,2,0,0,0,
+ 2,2,3,2,2,0,2,2,2,2,2,0,2,2,2,2,2,1,2,0,0,0,0,0,1,0,2,0,0,0,0,
+ 0,3,2,2,2,0,2,0,2,0,2,2,0,2,2,0,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0,
+ 0,0,0,0,2,1,0,0,0,0,0,0,0,0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
};
@@ -171,8 +193,8 @@ const SequenceModel Iso_8859_15DanishModel =
{
Iso_8859_15_CharToOrderMap,
DanishLangModel,
- 30,
- (float)0.9968082796759031,
+ 31,
+ (float)0.9958348814328518,
PR_TRUE,
"ISO-8859-15"
};
@@ -181,8 +203,8 @@ const SequenceModel Iso_8859_1DanishModel =
{
Iso_8859_1_CharToOrderMap,
DanishLangModel,
- 30,
- (float)0.9968082796759031,
+ 31,
+ (float)0.9958348814328518,
PR_TRUE,
"ISO-8859-1"
};
@@ -191,8 +213,18 @@ const SequenceModel Windows_1252DanishModel =
{
Windows_1252_CharToOrderMap,
DanishLangModel,
- 30,
- (float)0.9968082796759031,
+ 31,
+ (float)0.9958348814328518,
PR_TRUE,
"WINDOWS-1252"
};
+
+const SequenceModel Ibm865DanishModel =
+{
+ Ibm865_CharToOrderMap,
+ DanishLangModel,
+ 31,
+ (float)0.9958348814328518,
+ PR_TRUE,
+ "IBM865"
+};
diff --git a/src/LangModels/LangNorwegianModel.cpp b/src/LangModels/LangNorwegianModel.cpp
new file mode 100644
index 0000000..f3a876d
--- /dev/null
+++ b/src/LangModels/LangNorwegianModel.cpp
@@ -0,0 +1,323 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is Mozilla Communicator client code.
+ *
+ * The Initial Developer of the Original Code is
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 1998
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#include "../nsSBCharSetProber.h"
+
+/********* Language model for: Norwegian *********/
+
+/**
+ * Generated by BuildLangModel.py
+ * On: 2022-01-28 21:58:11.143599
+ **/
+
+/* Character Mapping Table:
+ * ILL: illegal character.
+ * CTR: control character specific to the charset.
+ * RET: carriage return or line feed.
+ * SYM: symbol (punctuation) that does not belong to word.
+ * NUM: 0 - 9.
+ *
+ * Other characters are ordered by probabilities
+ * (0 is the most common character in the language).
+ *
+ * Orders are generic to a language. So the codepoint with order X in
+ * CHARSET1 maps to the same character as the codepoint with the same
+ * order X in CHARSET2 for the same language.
+ * As such, some orders may be missing from a charset. For instance the
+ * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
+ * even though they are both used for French. Same for the euro sign.
+ */
+static const unsigned char Ibm865_CharToOrderMap[] =
+{
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 5, 17, 23, 9, 0, 14, 11, 18, 6, 22, 10, 7, 12, 2, 8, /* 4X */
+ 15, 29, 1, 4, 3, 16, 13, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 5, 17, 23, 9, 0, 14, 11, 18, 6, 22, 10, 7, 12, 2, 8, /* 6X */
+ 15, 29, 1, 4, 3, 16, 13, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ 43, 32, 28, 50, 31, 45, 19, 43, 53, 42, 41, 57, 61, 58, 31, 19, /* 8X */
+ 28, 24, 24, 37, 30, 54, 63, 59, 64, 30, 32, 21,SYM, 21,SYM,SYM, /* 9X */
+ 36, 33, 35, 40, 44, 44,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* DX */
+ 48, 46, 65, 66, 60, 60, 67, 62, 68, 69, 70, 71, 72, 73, 52,SYM, /* EX */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* FX */
+};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+
+static const unsigned char Iso_8859_15_CharToOrderMap[] =
+{
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 5, 17, 23, 9, 0, 14, 11, 18, 6, 22, 10, 7, 12, 2, 8, /* 4X */
+ 15, 29, 1, 4, 3, 16, 13, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 5, 17, 23, 9, 0, 14, 11, 18, 6, 22, 10, 7, 12, 2, 8, /* 6X */
+ 15, 29, 1, 4, 3, 16, 13, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
+ SYM,SYM,SYM,SYM,SYM,SYM, 47,SYM, 47,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
+ SYM,SYM,SYM,SYM, 49, 74,SYM,SYM, 49,SYM,SYM,SYM, 51, 51, 75,SYM, /* BX */
+ 45, 36, 50, 55, 31, 19, 24, 43, 41, 28, 53, 42, 58, 33, 61, 57, /* CX */
+ 34, 44, 54, 35, 37, 56, 30,SYM, 21, 59, 40, 76, 32, 39, 38, 46, /* DX */
+ 45, 36, 50, 55, 31, 19, 24, 43, 41, 28, 53, 42, 58, 33, 61, 57, /* EX */
+ 34, 44, 54, 35, 37, 56, 30,SYM, 21, 59, 40, 77, 32, 39, 38, 78, /* FX */
+};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+
+static const unsigned char Iso_8859_1_CharToOrderMap[] =
+{
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 5, 17, 23, 9, 0, 14, 11, 18, 6, 22, 10, 7, 12, 2, 8, /* 4X */
+ 15, 29, 1, 4, 3, 16, 13, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 5, 17, 23, 9, 0, 14, 11, 18, 6, 22, 10, 7, 12, 2, 8, /* 6X */
+ 15, 29, 1, 4, 3, 16, 13, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
+ SYM,SYM,SYM,SYM,SYM, 79,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
+ 45, 36, 50, 55, 31, 19, 24, 43, 41, 28, 53, 42, 58, 33, 61, 57, /* CX */
+ 34, 44, 54, 35, 37, 56, 30,SYM, 21, 59, 40, 80, 32, 39, 38, 46, /* DX */
+ 45, 36, 50, 55, 31, 19, 24, 43, 41, 28, 53, 42, 58, 33, 61, 57, /* EX */
+ 34, 44, 54, 35, 37, 56, 30,SYM, 21, 59, 40, 81, 32, 39, 38, 82, /* FX */
+};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+
+static const unsigned char Windows_1252_CharToOrderMap[] =
+{
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 5, 17, 23, 9, 0, 14, 11, 18, 6, 22, 10, 7, 12, 2, 8, /* 4X */
+ 15, 29, 1, 4, 3, 16, 13, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 5, 17, 23, 9, 0, 14, 11, 18, 6, 22, 10, 7, 12, 2, 8, /* 6X */
+ 15, 29, 1, 4, 3, 16, 13, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ SYM,ILL,SYM, 83,SYM,SYM,SYM,SYM,SYM,SYM, 47,SYM, 51,ILL, 49,ILL, /* 8X */
+ ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 47,SYM, 51,ILL, 49, 84, /* 9X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
+ SYM,SYM,SYM,SYM,SYM, 85,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
+ 45, 36, 50, 55, 31, 19, 24, 43, 41, 28, 53, 42, 58, 33, 61, 57, /* CX */
+ 34, 44, 54, 35, 37, 56, 30,SYM, 21, 59, 40, 86, 32, 39, 38, 46, /* DX */
+ 45, 36, 50, 55, 31, 19, 24, 43, 41, 28, 53, 42, 58, 33, 61, 57, /* EX */
+ 34, 44, 54, 35, 37, 56, 30,SYM, 21, 59, 40, 87, 32, 39, 38, 88, /* FX */
+};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+
+
+/* Model Table:
+ * Total sequences: 991
+ * First 512 sequences: 0.9975864274305254
+ * Next 512 sequences (512-1024): 0.002413572569474574
+ * Rest: 3.5128150388530344e-17
+ * Negative sequences: TODO
+ */
+static const PRUint8 NorwegianLangModel[] =
+{
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,0,2,0,
+ 0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,
+ 2,2,2,2,2,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,2,
+ 2,2,2,0,0,2,0,0,2,2,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,3,2,0,
+ 2,2,2,0,2,0,0,0,2,0,0,2,0,0,2,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,2,2,
+ 2,2,0,0,0,2,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,0,3,3,3,0,2,0,
+ 0,0,2,2,0,0,0,0,0,2,0,2,0,2,0,2,2,0,2,0,0,0,0,0,0,0,2,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,2,2,3,2,2,2,0,
+ 0,0,0,2,2,0,0,0,0,0,2,2,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,2,2,2,
+ 2,2,2,0,2,2,0,2,0,0,2,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,0,2,3,0,3,2,3,0,2,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,0,2,0,2,
+ 0,2,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,0,0,0,0,2,
+ 0,0,0,0,2,2,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,3,2,2,2,0,0,2,2,2,
+ 2,2,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,0,2,0,2,2,2,
+ 2,2,0,0,0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,0,3,2,0,0,2,0,0,
+ 2,0,2,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,2,2,3,2,2,3,0,3,2,2,3,3,3,3,3,3,0,0,0,2,0,2,
+ 0,2,0,0,2,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,0,2,0,0,
+ 2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,3,0,2,2,2,3,2,2,3,2,2,2,0,
+ 0,0,0,2,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,2,3,3,3,3,3,2,2,2,2,0,2,2,3,3,2,3,3,3,3,2,3,2,2,0,2,0,2,
+ 2,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,2,2,2,3,3,2,2,3,2,2,3,3,3,3,2,3,2,2,0,2,0,2,
+ 2,2,2,0,2,2,0,0,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,2,2,3,2,3,3,3,2,3,3,3,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,0,2,0,2,3,2,2,2,2,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,0,2,3,2,3,3,3,3,3,3,3,0,3,2,0,3,2,2,2,0,2,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,2,2,2,2,3,3,2,3,2,2,2,2,0,2,0,3,0,0,2,2,3,2,0,3,0,0,0,0,0,2,
+ 2,2,0,0,2,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,2,3,3,3,3,3,3,2,3,0,3,2,0,2,3,2,3,0,3,0,0,3,2,0,2,0,2,2,0,
+ 0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,
+ 0,3,3,3,3,0,2,2,0,2,2,2,2,2,2,2,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,2,2,2,3,3,3,2,3,2,2,0,2,0,2,2,2,2,3,0,2,2,2,2,0,2,0,0,0,0,0,
+ 2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,2,2,2,2,3,3,2,3,2,2,2,2,0,2,0,2,2,2,0,3,0,0,2,0,2,2,0,0,0,0,
+ 0,2,0,0,0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 2,0,2,2,2,3,3,2,2,0,0,0,0,2,2,2,2,2,2,0,2,0,0,0,0,0,0,2,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 2,2,3,2,2,2,0,2,2,2,0,2,2,2,2,0,0,2,2,0,0,0,0,2,0,0,2,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,2,0,0,2,2,0,0,0,0,0,0,2,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,2,2,2,2,0,0,2,0,2,2,2,2,2,2,2,0,0,2,0,2,0,0,2,0,0,0,0,0,0,2,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,2,2,2,2,0,0,2,0,2,0,2,2,0,2,0,2,2,2,0,0,0,0,2,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 2,2,2,0,2,0,0,2,0,2,0,2,0,0,2,0,0,2,2,0,0,0,0,2,0,2,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,2,2,0,2,2,0,2,0,0,2,2,2,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 2,2,2,0,0,2,2,2,0,0,0,0,2,0,0,0,2,2,0,0,0,0,0,0,0,2,0,0,0,0,0,
+ 0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,2,2,2,2,0,0,2,0,0,2,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,2,2,2,2,0,0,2,0,0,2,2,2,2,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,
+ 0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,2,2,2,2,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,2,2,0,2,0,0,0,0,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+};
+
+
+const SequenceModel Ibm865NorwegianModel =
+{
+ Ibm865_CharToOrderMap,
+ NorwegianLangModel,
+ 62,
+ (float)0.9975864274305254,
+ PR_TRUE,
+ "IBM865"
+};
+
+const SequenceModel Iso_8859_15NorwegianModel =
+{
+ Iso_8859_15_CharToOrderMap,
+ NorwegianLangModel,
+ 62,
+ (float)0.9975864274305254,
+ PR_TRUE,
+ "ISO-8859-15"
+};
+
+const SequenceModel Iso_8859_1NorwegianModel =
+{
+ Iso_8859_1_CharToOrderMap,
+ NorwegianLangModel,
+ 62,
+ (float)0.9975864274305254,
+ PR_TRUE,
+ "ISO-8859-1"
+};
+
+const SequenceModel Windows_1252NorwegianModel =
+{
+ Windows_1252_CharToOrderMap,
+ NorwegianLangModel,
+ 62,
+ (float)0.9975864274305254,
+ PR_TRUE,
+ "WINDOWS-1252"
+};
diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp
index f956d25..68205d5 100644
--- a/src/nsSBCSGroupProber.cpp
+++ b/src/nsSBCSGroupProber.cpp
@@ -110,86 +110,92 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[32] = new nsSingleByteCharSetProber(&Iso_8859_15DanishModel);
mProbers[33] = new nsSingleByteCharSetProber(&Iso_8859_1DanishModel);
mProbers[34] = new nsSingleByteCharSetProber(&Windows_1252DanishModel);
-
- mProbers[35] = new nsSingleByteCharSetProber(&Iso_8859_13LithuanianModel);
- mProbers[36] = new nsSingleByteCharSetProber(&Iso_8859_10LithuanianModel);
- mProbers[37] = new nsSingleByteCharSetProber(&Iso_8859_4LithuanianModel);
-
- mProbers[38] = new nsSingleByteCharSetProber(&Iso_8859_13LatvianModel);
- mProbers[39] = new nsSingleByteCharSetProber(&Iso_8859_10LatvianModel);
- mProbers[40] = new nsSingleByteCharSetProber(&Iso_8859_4LatvianModel);
-
- mProbers[41] = new nsSingleByteCharSetProber(&Iso_8859_1PortugueseModel);
- mProbers[42] = new nsSingleByteCharSetProber(&Iso_8859_9PortugueseModel);
- mProbers[43] = new nsSingleByteCharSetProber(&Iso_8859_15PortugueseModel);
- mProbers[44] = new nsSingleByteCharSetProber(&Windows_1252PortugueseModel);
-
- mProbers[45] = new nsSingleByteCharSetProber(&Iso_8859_3MalteseModel);
-
- mProbers[46] = new nsSingleByteCharSetProber(&Windows_1250CzechModel);
- mProbers[47] = new nsSingleByteCharSetProber(&Iso_8859_2CzechModel);
- mProbers[48] = new nsSingleByteCharSetProber(&Mac_CentraleuropeCzechModel);
- mProbers[49] = new nsSingleByteCharSetProber(&Ibm852CzechModel);
-
- mProbers[50] = new nsSingleByteCharSetProber(&Windows_1250SlovakModel);
- mProbers[51] = new nsSingleByteCharSetProber(&Iso_8859_2SlovakModel);
- mProbers[52] = new nsSingleByteCharSetProber(&Mac_CentraleuropeSlovakModel);
- mProbers[53] = new nsSingleByteCharSetProber(&Ibm852SlovakModel);
-
- mProbers[54] = new nsSingleByteCharSetProber(&Windows_1250PolishModel);
- mProbers[55] = new nsSingleByteCharSetProber(&Iso_8859_2PolishModel);
- mProbers[56] = new nsSingleByteCharSetProber(&Iso_8859_13PolishModel);
- mProbers[57] = new nsSingleByteCharSetProber(&Iso_8859_16PolishModel);
- mProbers[58] = new nsSingleByteCharSetProber(&Mac_CentraleuropePolishModel);
- mProbers[59] = new nsSingleByteCharSetProber(&Ibm852PolishModel);
-
- mProbers[60] = new nsSingleByteCharSetProber(&Iso_8859_1FinnishModel);
- mProbers[61] = new nsSingleByteCharSetProber(&Iso_8859_4FinnishModel);
- mProbers[62] = new nsSingleByteCharSetProber(&Iso_8859_9FinnishModel);
- mProbers[63] = new nsSingleByteCharSetProber(&Iso_8859_13FinnishModel);
- mProbers[64] = new nsSingleByteCharSetProber(&Iso_8859_15FinnishModel);
- mProbers[65] = new nsSingleByteCharSetProber(&Windows_1252FinnishModel);
-
- mProbers[66] = new nsSingleByteCharSetProber(&Iso_8859_1ItalianModel);
- mProbers[67] = new nsSingleByteCharSetProber(&Iso_8859_3ItalianModel);
- mProbers[68] = new nsSingleByteCharSetProber(&Iso_8859_9ItalianModel);
- mProbers[69] = new nsSingleByteCharSetProber(&Iso_8859_15ItalianModel);
- mProbers[70] = new nsSingleByteCharSetProber(&Windows_1252ItalianModel);
-
- mProbers[71] = new nsSingleByteCharSetProber(&Windows_1250CroatianModel);
- mProbers[72] = new nsSingleByteCharSetProber(&Iso_8859_2CroatianModel);
- mProbers[73] = new nsSingleByteCharSetProber(&Iso_8859_13CroatianModel);
- mProbers[74] = new nsSingleByteCharSetProber(&Iso_8859_16CroatianModel);
- mProbers[75] = new nsSingleByteCharSetProber(&Mac_CentraleuropeCroatianModel);
- mProbers[76] = new nsSingleByteCharSetProber(&Ibm852CroatianModel);
-
- mProbers[77] = new nsSingleByteCharSetProber(&Windows_1252EstonianModel);
- mProbers[78] = new nsSingleByteCharSetProber(&Windows_1257EstonianModel);
- mProbers[79] = new nsSingleByteCharSetProber(&Iso_8859_4EstonianModel);
- mProbers[80] = new nsSingleByteCharSetProber(&Iso_8859_13EstonianModel);
- mProbers[81] = new nsSingleByteCharSetProber(&Iso_8859_15EstonianModel);
-
- mProbers[82] = new nsSingleByteCharSetProber(&Iso_8859_1IrishModel);
- mProbers[83] = new nsSingleByteCharSetProber(&Iso_8859_9IrishModel);
- mProbers[84] = new nsSingleByteCharSetProber(&Iso_8859_15IrishModel);
- mProbers[85] = new nsSingleByteCharSetProber(&Windows_1252IrishModel);
-
- mProbers[86] = new nsSingleByteCharSetProber(&Windows_1250RomanianModel);
- mProbers[87] = new nsSingleByteCharSetProber(&Iso_8859_2RomanianModel);
- mProbers[88] = new nsSingleByteCharSetProber(&Iso_8859_16RomanianModel);
- mProbers[89] = new nsSingleByteCharSetProber(&Ibm852RomanianModel);
-
- mProbers[90] = new nsSingleByteCharSetProber(&Windows_1250SloveneModel);
- mProbers[91] = new nsSingleByteCharSetProber(&Iso_8859_2SloveneModel);
- mProbers[92] = new nsSingleByteCharSetProber(&Iso_8859_16SloveneModel);
- mProbers[93] = new nsSingleByteCharSetProber(&Mac_CentraleuropeSloveneModel);
- mProbers[94] = new nsSingleByteCharSetProber(&Ibm852SloveneModel);
-
- mProbers[95] = new nsSingleByteCharSetProber(&Iso_8859_1SwedishModel);
- mProbers[96] = new nsSingleByteCharSetProber(&Iso_8859_4SwedishModel);
- mProbers[97] = new nsSingleByteCharSetProber(&Iso_8859_9SwedishModel);
- mProbers[98] = new nsSingleByteCharSetProber(&Iso_8859_15SwedishModel);
- mProbers[99] = new nsSingleByteCharSetProber(&Windows_1252SwedishModel);
+ mProbers[35] = new nsSingleByteCharSetProber(&Ibm865DanishModel);
+
+ mProbers[36] = new nsSingleByteCharSetProber(&Iso_8859_13LithuanianModel);
+ mProbers[37] = new nsSingleByteCharSetProber(&Iso_8859_10LithuanianModel);
+ mProbers[38] = new nsSingleByteCharSetProber(&Iso_8859_4LithuanianModel);
+
+ mProbers[39] = new nsSingleByteCharSetProber(&Iso_8859_13LatvianModel);
+ mProbers[40] = new nsSingleByteCharSetProber(&Iso_8859_10LatvianModel);
+ mProbers[41] = new nsSingleByteCharSetProber(&Iso_8859_4LatvianModel);
+
+ mProbers[42] = new nsSingleByteCharSetProber(&Iso_8859_1PortugueseModel);
+ mProbers[43] = new nsSingleByteCharSetProber(&Iso_8859_9PortugueseModel);
+ mProbers[44] = new nsSingleByteCharSetProber(&Iso_8859_15PortugueseModel);
+ mProbers[45] = new nsSingleByteCharSetProber(&Windows_1252PortugueseModel);
+
+ mProbers[46] = new nsSingleByteCharSetProber(&Iso_8859_3MalteseModel);
+
+ mProbers[47] = new nsSingleByteCharSetProber(&Windows_1250CzechModel);
+ mProbers[48] = new nsSingleByteCharSetProber(&Iso_8859_2CzechModel);
+ mProbers[49] = new nsSingleByteCharSetProber(&Mac_CentraleuropeCzechModel);
+ mProbers[50] = new nsSingleByteCharSetProber(&Ibm852CzechModel);
+
+ mProbers[51] = new nsSingleByteCharSetProber(&Windows_1250SlovakModel);
+ mProbers[52] = new nsSingleByteCharSetProber(&Iso_8859_2SlovakModel);
+ mProbers[53] = new nsSingleByteCharSetProber(&Mac_CentraleuropeSlovakModel);
+ mProbers[54] = new nsSingleByteCharSetProber(&Ibm852SlovakModel);
+
+ mProbers[55] = new nsSingleByteCharSetProber(&Windows_1250PolishModel);
+ mProbers[56] = new nsSingleByteCharSetProber(&Iso_8859_2PolishModel);
+ mProbers[57] = new nsSingleByteCharSetProber(&Iso_8859_13PolishModel);
+ mProbers[58] = new nsSingleByteCharSetProber(&Iso_8859_16PolishModel);
+ mProbers[59] = new nsSingleByteCharSetProber(&Mac_CentraleuropePolishModel);
+ mProbers[60] = new nsSingleByteCharSetProber(&Ibm852PolishModel);
+
+ mProbers[61] = new nsSingleByteCharSetProber(&Iso_8859_1FinnishModel);
+ mProbers[62] = new nsSingleByteCharSetProber(&Iso_8859_4FinnishModel);
+ mProbers[63] = new nsSingleByteCharSetProber(&Iso_8859_9FinnishModel);
+ mProbers[64] = new nsSingleByteCharSetProber(&Iso_8859_13FinnishModel);
+ mProbers[65] = new nsSingleByteCharSetProber(&Iso_8859_15FinnishModel);
+ mProbers[66] = new nsSingleByteCharSetProber(&Windows_1252FinnishModel);
+
+ mProbers[67] = new nsSingleByteCharSetProber(&Iso_8859_1ItalianModel);
+ mProbers[68] = new nsSingleByteCharSetProber(&Iso_8859_3ItalianModel);
+ mProbers[69] = new nsSingleByteCharSetProber(&Iso_8859_9ItalianModel);
+ mProbers[70] = new nsSingleByteCharSetProber(&Iso_8859_15ItalianModel);
+ mProbers[71] = new nsSingleByteCharSetProber(&Windows_1252ItalianModel);
+
+ mProbers[72] = new nsSingleByteCharSetProber(&Windows_1250CroatianModel);
+ mProbers[73] = new nsSingleByteCharSetProber(&Iso_8859_2CroatianModel);
+ mProbers[74] = new nsSingleByteCharSetProber(&Iso_8859_13CroatianModel);
+ mProbers[75] = new nsSingleByteCharSetProber(&Iso_8859_16CroatianModel);
+ mProbers[76] = new nsSingleByteCharSetProber(&Mac_CentraleuropeCroatianModel);
+ mProbers[77] = new nsSingleByteCharSetProber(&Ibm852CroatianModel);
+
+ mProbers[78] = new nsSingleByteCharSetProber(&Windows_1252EstonianModel);
+ mProbers[79] = new nsSingleByteCharSetProber(&Windows_1257EstonianModel);
+ mProbers[80] = new nsSingleByteCharSetProber(&Iso_8859_4EstonianModel);
+ mProbers[81] = new nsSingleByteCharSetProber(&Iso_8859_13EstonianModel);
+ mProbers[82] = new nsSingleByteCharSetProber(&Iso_8859_15EstonianModel);
+
+ mProbers[83] = new nsSingleByteCharSetProber(&Iso_8859_1IrishModel);
+ mProbers[84] = new nsSingleByteCharSetProber(&Iso_8859_9IrishModel);
+ mProbers[85] = new nsSingleByteCharSetProber(&Iso_8859_15IrishModel);
+ mProbers[86] = new nsSingleByteCharSetProber(&Windows_1252IrishModel);
+
+ mProbers[87] = new nsSingleByteCharSetProber(&Windows_1250RomanianModel);
+ mProbers[88] = new nsSingleByteCharSetProber(&Iso_8859_2RomanianModel);
+ mProbers[89] = new nsSingleByteCharSetProber(&Iso_8859_16RomanianModel);
+ mProbers[90] = new nsSingleByteCharSetProber(&Ibm852RomanianModel);
+
+ mProbers[91] = new nsSingleByteCharSetProber(&Windows_1250SloveneModel);
+ mProbers[92] = new nsSingleByteCharSetProber(&Iso_8859_2SloveneModel);
+ mProbers[93] = new nsSingleByteCharSetProber(&Iso_8859_16SloveneModel);
+ mProbers[94] = new nsSingleByteCharSetProber(&Mac_CentraleuropeSloveneModel);
+ mProbers[95] = new nsSingleByteCharSetProber(&Ibm852SloveneModel);
+
+ mProbers[96] = new nsSingleByteCharSetProber(&Iso_8859_1SwedishModel);
+ mProbers[97] = new nsSingleByteCharSetProber(&Iso_8859_4SwedishModel);
+ mProbers[98] = new nsSingleByteCharSetProber(&Iso_8859_9SwedishModel);
+ mProbers[99] = new nsSingleByteCharSetProber(&Iso_8859_15SwedishModel);
+ mProbers[100] = new nsSingleByteCharSetProber(&Windows_1252SwedishModel);
+
+ mProbers[101] = new nsSingleByteCharSetProber(&Iso_8859_15NorwegianModel);
+ mProbers[102] = new nsSingleByteCharSetProber(&Iso_8859_1NorwegianModel);
+ mProbers[103] = new nsSingleByteCharSetProber(&Windows_1252NorwegianModel);
+ mProbers[104] = new nsSingleByteCharSetProber(&Ibm865NorwegianModel);
Reset();
}
diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h
index ec72324..2401fff 100644
--- a/src/nsSBCSGroupProber.h
+++ b/src/nsSBCSGroupProber.h
@@ -40,7 +40,7 @@
#define nsSBCSGroupProber_h__
-#define NUM_OF_SBCS_PROBERS 100
+#define NUM_OF_SBCS_PROBERS 105 /* keep in sync with the mProbers[] slots filled in nsSBCSGroupProber.cpp (0..104) */
class nsCharSetProber;
class nsSBCSGroupProber: public nsCharSetProber {
diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h
index 42d21b2..f55528c 100644
--- a/src/nsSBCharSetProber.h
+++ b/src/nsSBCharSetProber.h
@@ -171,6 +171,7 @@ extern const SequenceModel Windows_1258VietnameseModel;
extern const SequenceModel Iso_8859_15DanishModel;
extern const SequenceModel Iso_8859_1DanishModel;
extern const SequenceModel Windows_1252DanishModel;
+extern const SequenceModel Ibm865DanishModel;
extern const SequenceModel Iso_8859_13LithuanianModel;
extern const SequenceModel Iso_8859_10LithuanianModel;
@@ -252,5 +253,10 @@ extern const SequenceModel Iso_8859_9SwedishModel;
extern const SequenceModel Iso_8859_15SwedishModel;
extern const SequenceModel Windows_1252SwedishModel;
+extern const SequenceModel Iso_8859_15NorwegianModel;
+extern const SequenceModel Iso_8859_1NorwegianModel;
+extern const SequenceModel Windows_1252NorwegianModel;
+extern const SequenceModel Ibm865NorwegianModel;
+
#endif /* nsSingleByteCharSetProber_h__ */
diff --git a/src/tools/CMakeLists.txt b/src/tools/CMakeLists.txt
index fa15eb2..7afad1d 100644
--- a/src/tools/CMakeLists.txt
+++ b/src/tools/CMakeLists.txt
@@ -3,6 +3,18 @@ set(
uchardet.cpp
)
+include(CheckSymbolExists)
+
+check_symbol_exists(getopt_long getopt.h HAVE_GETOPT_LONG)
+
+# MSVC on Windows ships no `getopt_long`. When the check above fails, look
+# for a third-party implementation instead (for example the `getopt-win32`
+# port from `vcpkg`); results go to GETOPT_LIBRARY and GETOPT_INCLUDE_DIR.
+if(NOT HAVE_GETOPT_LONG)
+ find_library(GETOPT_LIBRARY NAMES getopt)
+ find_path(GETOPT_INCLUDE_DIR NAMES getopt.h)
+endif()
+
set(UCHARDET_BINARY uchardet)
add_executable(
@@ -10,6 +22,11 @@ add_executable(
${UCHARDET_SOURCES}
)
+if (GETOPT_INCLUDE_DIR AND GETOPT_LIBRARY) # use the external getopt only when both its header dir and library are known
+ target_include_directories(${UCHARDET_BINARY} PRIVATE ${GETOPT_INCLUDE_DIR})
+ target_link_libraries(${UCHARDET_BINARY} ${GETOPT_LIBRARY}) # plain signature: the call below is keyword-less and CMP0023 forbids mixing both on one target
+endif ()
+
target_link_libraries(
${UCHARDET_BINARY}
${UCHARDET_LIBRARY}
@@ -18,6 +35,8 @@ target_link_libraries(
install(
TARGETS
${UCHARDET_BINARY}
+ EXPORT
+ UchardetTargets # export set this installed target is recorded in
RUNTIME DESTINATION
${CMAKE_INSTALL_BINDIR}
)
diff --git a/src/tools/uchardet.cpp b/src/tools/uchardet.cpp
index c6bf5f4..af25acf 100644
--- a/src/tools/uchardet.cpp
+++ b/src/tools/uchardet.cpp
@@ -35,12 +35,9 @@
*
* ***** END LICENSE BLOCK ***** */
#include "../uchardet.h"
-#include <cstdio>
-#include <cstring>
-#include <cstdlib>
#include <getopt.h>
-#include <iostream>
#include <stdio.h>
+#include <stdlib.h>
#include <string.h>
#ifndef VERSION
@@ -48,15 +45,17 @@
#endif
#define BUFFER_SIZE 65536
-char buffer[BUFFER_SIZE];
+static char buffer[BUFFER_SIZE];
-void detect(FILE * fp)
+static void detect(FILE * fp)
{
uchardet_t handle = uchardet_new();
- while (!feof(fp))
+ while (1)
{
size_t len = fread(buffer, 1, BUFFER_SIZE, fp);
+ if (len == 0)
+ break;
int retval = uchardet_handle_data(handle, buffer, len);
if (retval != 0)
{
@@ -75,7 +74,7 @@ void detect(FILE * fp)
uchardet_delete(handle);
}
-void show_version()
+static void show_version()
{
printf("\n");
printf("uchardet Command Line Tool\n");
@@ -86,7 +85,7 @@ void show_version()
printf("\n");
}
-void show_usage()
+static void show_usage()
{
show_version();
printf("Usage:\n");
diff --git a/test/da/ibm865.txt b/test/da/ibm865.txt
new file mode 100644
index 0000000..25034f3
--- /dev/null
+++ b/test/da/ibm865.txt
@@ -0,0 +1,5 @@
+Jimi Hendrix (1942-1970) var en amerikansk rockguitarist, sanger og sangskriver.
+
+Han begyndte at spille guitar, da han var femten †r, og efter at have spillet med blandt andet Little Richard dannede han Jimi Hendrix Experience i slutningen af 1966. Denne gruppe fik snart hits med sange som "Hey Joe" og "Purple Haze", og med det tredje album, Electric Ladyland fra 1968, fik gruppen sit store gennembrud. Med flere markante optr‘dener p† tidens store festivaler, heriblandt Woodstock, opn†ede han legendarisk status i rockmusikken, allerede mens han var i live.
+
+Hendrix brugte sin elektriske guitar som elektronisk lydkilde og eksperimenterede med feedback og distortion med udgangspunkt i traditionel rock'n'roll og blues. Hans misbrug af alkohol og narkotika f›rte imidlertid til, at han ›delagde sig selv, og han d›de som blot 27-†rig efter indtagelse af sovepiller.
diff --git a/test/mt/iso-8859-3.txt b/test/mt/iso-8859-3.txt
index 255269b..d98884a 100644
--- a/test/mt/iso-8859-3.txt
+++ b/test/mt/iso-8859-3.txt
@@ -1,4 +1 @@
-Franza (Franåi¿:France), uffiåjalment ir-Repubblika Franåi¿a (Franåi¿:
-République française), hi pajji¿ fl-Ewropa tal-Punent. Il-belt belt kapitali
-tag±ha hi Pariõi. Hi membru tal-Unjoni Ewropea. Franza hi maqsuma f'22 régions
-li huma suddivi¿i f' départements.
+Il-Malti huwa l-ilsien nazzjonali tar-Repubblika ta' Malta. Huwa l-ilsien uffiåjali flimkien mal-Ingli¿; kif ukoll wie±ed mill-ilsna uffiåjali tal-Unjoni Ewropea. Dan l-ilsien g±andu sisien u g±erq semitiku, ta' djalett G±arbi li õej mit-Tramuntana tal-Afrika, g±al±ekk qatt ma kellu rabta mill-qrib mal-G±arbi Klassiku. I¿da tul i¿-¿minijiet, min±abba proåess tal-Latinizzazzjoni ta' Malta, bdew de±lin bosta elementi lingwistiåi mill-Isqalli, djalett ta' art li wkoll g±addiet minn ¿mien ta' ±akma G±arbija. Wara l-Isqalli beda die±el ukoll it-Taljan, fuq kollox fi¿-¿mien tad-da±la tal-Kavallieri tal-Ordni ta' San Õwann sa meta l-Ingli¿ ±a post it-Taljan b±ala l-ilsien uffiåjali fil-Kostituzzjoni Kolonjali tal-1934. Il-Malti huwa l-ilsien wa±dieni ta' g±ajn semitika li jinkiteb b'ittri Latini. L-alfabett Malti mag±mul minn 30 ittra (24 konsonanti u 6 vokali) li jidhru f'din l-ordni:
diff --git a/test/no/ibm865.txt b/test/no/ibm865.txt
new file mode 100644
index 0000000..9fcbc9a
--- /dev/null
+++ b/test/no/ibm865.txt
@@ -0,0 +1,17 @@
+Pangramer brukes som ren underholdning; som skriveeksempel for pr›ve p†
+h†ndskrift; som hjelpemiddel til † vise en font; eller som huskeregel for †
+raskt teste tegnsettet i teknisk utstyr som behandler eller viser bokstaver.
+
+
+S‘r golfer med k›lle vant sexquiz p† wc i hjemby.
+H›vdingens kj‘re squaw f†r litt pizza i Mexico by.
+V†r kj‘re m›y i cape ›vde banjo, whist og quiz i taxifila.
+IQ-l›s WC-boms uten h›rsel skj‘rer god pizza p† xylofon.
+V†r kj‘re zulu-m›y ›vde banjo, whist og quickstep fra taxi.
+Etter quiz og whist m† Jo b‘re fakkellys p† v†r s›rg†ende cox.
+Taxisj†f›ren quizet bedre om calypso, watt og kl‘r p† hjemveien.
+V†r s‘re Zulu fra bade›ya spilte jo whist og quickstep i min taxi.
+Du †t ca fire wienerp›lser og tok taxi hjem fra byen med ‘re fra quizen.
+Jeg begynte † fort‘re en sandwich mens jeg kj›rte taxi p† vei til quiz.
+Quisling var ein kl›ppar til † spela jazz p† xylofon, men l‘rte seg aldri † spela cembalo f›r han drog til Washington.
+H›vdingens kj‘re squaw f†r litt pizza i Mexico by.
diff --git a/test/no/iso-8859-1.txt b/test/no/iso-8859-1.txt
new file mode 100644
index 0000000..95262c6
--- /dev/null
+++ b/test/no/iso-8859-1.txt
@@ -0,0 +1,20 @@
+Pangramer brukes som ren underholdning; som skriveeksempel for prøve på
+håndskrift; som hjelpemiddel til å vise en font; eller som huskeregel for å
+raskt teste tegnsettet i teknisk utstyr som behandler eller viser bokstaver.
+
+
+Sær golfer med kølle vant sexquiz på wc i hjemby.
+Høvdingens kjære squaw får litt pizza i Mexico by.
+Vår kjære møy i cape øvde banjo, whist og quiz i taxifila.
+IQ-løs WC-boms uten hørsel skjærer god pizza på xylofon.
+Vår kjære zulu-møy øvde banjo, whist og quickstep fra taxi.
+Etter quiz og whist må Jo bære fakkellys på vår sørgående cox.
+Taxisjåføren quizet bedre om calypso, watt og klær på hjemveien.
+Vår sære Zulu fra badeøya spilte jo whist og quickstep i min taxi.
+Du åt ca fire wienerpølser og tok taxi hjem fra byen med ære fra quizen.
+Jeg begynte å fortære en sandwich mens jeg kjørte taxi på vei til quiz.
+Quisling var ein kløppar til å spela jazz på xylofon, men lærte seg aldri å spela cembalo før han drog til Washington.
+Høvdingens kjære squaw får litt pizza i Mexico by.
+
+Et sted, cirka ¾ inn i John Greens siste roman, Skilpadder hele veien ned,
+begynte jeg og romanens forteller, Aza Holmes, å gråte helt samtidig
diff --git a/test/no/iso-8859-15.txt b/test/no/iso-8859-15.txt
new file mode 100644
index 0000000..4571f52
--- /dev/null
+++ b/test/no/iso-8859-15.txt
@@ -0,0 +1,21 @@
+Pangramer brukes som ren underholdning; som skriveeksempel for prøve på
+håndskrift; som hjelpemiddel til å vise en font; eller som huskeregel for å
+raskt teste tegnsettet i teknisk utstyr som behandler eller viser bokstaver.
+
+
+Sær golfer med kølle vant sexquiz på wc i hjemby.
+Høvdingens kjære squaw får litt pizza i Mexico by.
+Vår kjære møy i cape øvde banjo, whist og quiz i taxifila.
+IQ-løs WC-boms uten hørsel skjærer god pizza på xylofon.
+Vår kjære zulu-møy øvde banjo, whist og quickstep fra taxi.
+Etter quiz og whist må Jo bære fakkellys på vår sørgående cox.
+Taxisjåføren quizet bedre om calypso, watt og klær på hjemveien.
+Vår sære Zulu fra badeøya spilte jo whist og quickstep i min taxi.
+Du åt ca fire wienerpølser og tok taxi hjem fra byen med ære fra quizen.
+Jeg begynte å fortære en sandwich mens jeg kjørte taxi på vei til quiz.
+Quisling var ein kløppar til å spela jazz på xylofon, men lærte seg aldri å spela cembalo før han drog til Washington.
+Høvdingens kjære squaw får litt pizza i Mexico by.
+
+Euro (symbol: ¤) er den Den europeiske unions myntenhet. Den
+er innført i 19 av unionens 27 medlemsland (kjent som eurosonen) og i fire
+mikrostater og noen andre land og områder.
diff --git a/test/no/utf-8.txt b/test/no/utf-8.txt
new file mode 100644
index 0000000..e5f5e70
--- /dev/null
+++ b/test/no/utf-8.txt
@@ -0,0 +1,20 @@
+Pangramer brukes som ren underholdning; som skriveeksempel for prøve på
+håndskrift; som hjelpemiddel til å vise en font; eller som huskeregel for å
+raskt teste tegnsettet i teknisk utstyr som behandler eller viser bokstaver.
+
+
+Sær golfer med kølle vant sexquiz på wc i hjemby.
+Høvdingens kjære squaw får litt pizza i Mexico by.
+Vår kjære møy i cape øvde banjo, whist og quiz i taxifila.
+IQ-løs WC-boms uten hørsel skjærer god pizza på xylofon.
+Vår kjære zulu-møy øvde banjo, whist og quickstep fra taxi.
+Etter quiz og whist må Jo bære fakkellys på vår sørgående cox.
+Taxisjåføren quizet bedre om calypso, watt og klær på hjemveien.
+Vår sære Zulu fra badeøya spilte jo whist og quickstep i min taxi.
+Du åt ca fire wienerpølser og tok taxi hjem fra byen med ære fra quizen.
+Jeg begynte å fortære en sandwich mens jeg kjørte taxi på vei til quiz.
+Quisling var ein kløppar til å spela jazz på xylofon, men lærte seg aldri å spela cembalo før han drog til Washington.
+Høvdingens kjære squaw får litt pizza i Mexico by.
+
+Et sted, cirka ¾ inn i John Greens siste roman, Skilpadder hele veien ned,
+begynte jeg og romanens forteller, Aza Holmes, å gråte helt samtidig
diff --git a/test/no/windows-1252.txt b/test/no/windows-1252.txt
new file mode 100644
index 0000000..eca363d
--- /dev/null
+++ b/test/no/windows-1252.txt
@@ -0,0 +1,21 @@
+Pangramer brukes som ren underholdning; som skriveeksempel for prøve på
+håndskrift; som hjelpemiddel til å vise en font; eller som huskeregel for å
+raskt teste tegnsettet i teknisk utstyr som behandler eller viser bokstaver.
+
+
+Sær golfer med kølle vant sexquiz på wc i hjemby.
+Høvdingens kjære squaw får litt pizza i Mexico by.
+Vår kjære møy i cape øvde banjo, whist og quiz i taxifila.
+IQ-løs WC-boms uten hørsel skjærer god pizza på xylofon.
+Vår kjære zulu-møy øvde banjo, whist og quickstep fra taxi.
+Etter quiz og whist må Jo bære fakkellys på vår sørgående cox.
+Taxisjåføren quizet bedre om calypso, watt og klær på hjemveien.
+Vår sære Zulu fra badeøya spilte jo whist og quickstep i min taxi.
+Du åt ca fire wienerpølser og tok taxi hjem fra byen med ære fra quizen.
+Jeg begynte å fortære en sandwich mens jeg kjørte taxi på vei til quiz.
+Quisling var ein kløppar til å spela jazz på xylofon, men lærte seg aldri å spela cembalo før han drog til Washington.
+Høvdingens kjære squaw får litt pizza i Mexico by.
+
+Euro (symbol: € – valutakode: EUR) er den Den europeiske unions myntenhet. Den
+er innført i 19 av unionens 27 medlemsland (kjent som eurosonen) og i fire
+mikrostater og noen andre land og områder.
diff --git a/test/uchardet-tests.c b/test/uchardet-tests.c
index b39c80c..8d6ab38 100644
--- a/test/uchardet-tests.c
+++ b/test/uchardet-tests.c
@@ -52,9 +52,11 @@ detect(FILE *fp)
char buffer[BUFFER_SIZE];
int i;
- while (!feof(fp))
+ while (1)
{
size_t len = fread(buffer, 1, BUFFER_SIZE, fp);
+ if (len == 0)
+ break;
int retval = uchardet_handle_data(handle, buffer, len);
if (retval != 0)
{
@@ -122,6 +124,9 @@ main(int argc, char ** argv)
/* In a unit test, 0 means success, other returned values mean failure. */
success = (strcmp(charset, expected_charset) != 0);
+ if (success) {
+ fprintf(stderr, "Got %s, expected %s\n", charset, expected_charset);
+ }
free(charset);
free(filename);
diff --git a/uchardet-config.cmake.in b/uchardet-config.cmake.in
new file mode 100644
index 0000000..b6759b4
--- /dev/null
+++ b/uchardet-config.cmake.in
@@ -0,0 +1,19 @@
+# This file may optionally do:
+#
+# 1. Check for dependencies of exported targets. Example:
+#
+# include(CMakeFindDependencyMacro)
+# find_dependency(MYDEP)
+#
+# find_dependency() has the same syntax as find_package(); QUIET and REQUIRED are forwarded from the caller
+#
+# 2. Capture values from configuration. Example:
+#
+# set(my-config-var @my-config-var@)
+#
+# 3. Other required setup when importing targets from another project
+#
+# See also:
+# https://cliutils.gitlab.io/modern-cmake/chapters/install.html
+#
+include("${CMAKE_CURRENT_LIST_DIR}/uchardet-targets.cmake")