summaryrefslogtreecommitdiff
path: root/script
diff options
context:
space:
mode:
authorJames Cowgill <jcowgill@debian.org>2023-11-29 19:50:06 +0000
committerJames Cowgill <jcowgill@debian.org>2023-11-29 19:50:06 +0000
commit1a23af3ab4ffcc045291cfa23f80d842edc4877d (patch)
treee96d8c8a4bce7ae5b391d24081b090bed5d1f466 /script
parent8c56f68b6fb386c5df7e2b7c2e2e72d3f392c844 (diff)
parent46fbd223191890161d0409ee542ac6bc22537bf7 (diff)
Update upstream source from tag 'upstream/0.0.8'
Update to upstream version '0.0.8' with Debian dir e343cd97b840fe74c9b2e75b834a8faac2e35ef3
Diffstat (limited to 'script')
-rwxr-xr-xscript/BuildLangModel.py21
-rw-r--r--script/BuildLangModelLogs/LangDanishModel.log384
-rw-r--r--script/charsets/ibm865.py71
-rw-r--r--script/langs/da.py2
-rw-r--r--script/langs/no.py55
5 files changed, 388 insertions, 145 deletions
diff --git a/script/BuildLangModel.py b/script/BuildLangModel.py
index 38ac793..43f975c 100755
--- a/script/BuildLangModel.py
+++ b/script/BuildLangModel.py
@@ -50,6 +50,7 @@ import requests
import sys
import re
import os
+import random
# Custom modules.
import charsets.db
@@ -240,12 +241,22 @@ def visit_pages(titles, depth, lang, logfd):
return
next_titles = []
+ if options.max_page is not None:
+ max_titles = int(options.max_page/(options.max_depth * options.max_depth))
+ else:
+ max_titles = sys.maxsize
for title in titles:
if options.max_page is not None and \
len(visited_pages) > options.max_page:
return
if title in visited_pages:
continue
+
+ # Ugly hack skipping internal pages
+ if 'wiki' in title or 'Wiki' in title:
+ print('Skipping', title)
+ continue
+
visited_pages += [title]
try:
page = wikipedia.page(title)
@@ -255,16 +266,22 @@ def visit_pages(titles, depth, lang, logfd):
print("Discarding page {}.\n".format(title))
continue
logfd.write("\n{} (revision {})".format(title, page.revision_id))
+ logfd.flush()
process_text(page.content, lang)
try:
- next_titles += page.links
+ links = page.links
+ random.shuffle(links)
+ if len(links) > max_titles:
+ links = links[:max_titles]
+ next_titles += links
except KeyError:
pass
if depth >= options.max_depth:
return
+ random.shuffle(next_titles)
visit_pages (next_titles, depth + 1, lang, logfd)
language_c = lang.name.replace('-', '_').title()
@@ -277,6 +294,7 @@ logfd.write('\n- Maximum depth: {}'.format(options.max_depth))
if options.max_page is not None:
logfd.write('\n- Max number of pages: {}'.format(options.max_page))
logfd.write('\n\n== Parsed pages ==\n')
+logfd.flush()
try:
visit_pages(lang.start_pages, 0, lang, logfd)
except requests.exceptions.ConnectionError:
@@ -284,6 +302,7 @@ except requests.exceptions.ConnectionError:
exit(1)
logfd.write('\n\n== End of Parsed pages ==')
logfd.write('\n\n- Wikipedia parsing ended at: {}\n'.format(str(datetime.datetime.now())))
+logfd.flush()
########### CHARACTERS ###########
diff --git a/script/BuildLangModelLogs/LangDanishModel.log b/script/BuildLangModelLogs/LangDanishModel.log
index cf183b3..14bf65d 100644
--- a/script/BuildLangModelLogs/LangDanishModel.log
+++ b/script/BuildLangModelLogs/LangDanishModel.log
@@ -1,158 +1,256 @@
= Logs of language model for Danish (da) =
- Generated by BuildLangModel.py
-- Started: 2016-02-19 17:53:58.564190
-- Maximum depth: 4
-- Max number of pages: 100
+- Started: 2022-11-30 19:37:01.097250
+- Maximum depth: 2
+- Max number of pages: 200
== Parsed pages ==
-Forside (revision 2692411)
-16. februar (revision 6877446)
-17. februar (revision 8454583)
-1878 (revision 8280505)
-19. februar (revision 8206479)
-1922 (revision 8455105)
-1926 (revision 8425271)
-1942 (revision 8443554)
-1945 (revision 8448461)
-1948 (revision 8454392)
-1985 (revision 8409096)
-2. verdenskrig (revision 8433181)
-23. oktober (revision 6877825)
-26. oktober (revision 7849938)
-3C 273 (revision 8443798)
-A-bus (revision 8427319)
-Aktuelle begivenheder (revision 8440596)
-B-52 Stratofortress (revision 8422571)
-Borgerkrigen i Syrien (revision 8447763)
-Boutros Boutros-Ghali (revision 8453935)
-Brasilien (revision 8452750)
-Cusco (region) (revision 7693764)
-Danmark (revision 8451178)
-Danmark i Eurovision Song Contest (revision 8453514)
-Dansk (sprog) (revision 8455750)
-Dansk Melodi Grand Prix 2016 (revision 8452164)
-Dobbeltmordet på Peter Bangs Vej (revision 8334648)
-Encyklopædi (revision 8446641)
-Eritrea-sagen (revision 8452285)
-Eurovision Song Contest 2014 (revision 8445804)
-Eurovision Song Contest 2016 (revision 8453588)
-Flygtningekrisen i Europa 2015 (revision 8452286)
-Fonograf (revision 8177165)
-Formel 1 (revision 8450846)
-Formel 1 2016 (revision 8456463)
-Frederik 6. (revision 8438503)
-Første observation af gravitationsbølger (revision 8451269)
-Grammofon (revision 8375093)
-Guadalcanal (revision 7796248)
-Harper Lee (revision 8456583)
-Hartkorn (revision 8437552)
-IC4 (revision 8446402)
-IC4-sagen (revision 8434463)
-Islamisk Stat (revision 8439228)
-Jonathan Leunbach (revision 8452603)
-Juliane Marie af Braunschweig-Wolfenbüttel (revision 8437957)
-Kaliumklorid (revision 8452216)
-Kejserriget Japan (revision 8044942)
-Kevin Magnussen (revision 8455302)
-København (revision 8427847)
-LIGO (revision 8451266)
-Latinamerika (revision 7692181)
-Leonid Hurwicz (revision 8445727)
-Lighthouse X (revision 8452940)
-Linkoban (revision 8455879)
-Machu Picchu (revision 8406907)
-Matador (tv-serie) (revision 8454648)
-Middelaldercentret (revision 8449194)
-Nobelprisen (revision 8409809)
-Nykøbing Falster (revision 8452825)
-Nyligt afdøde (revision 8456580)
-Overvågning (revision 8455039)
-Panorama (foto) (revision 8448393)
-Peru (revision 8437485)
-Peter Lauritsen (revision 8456097)
-Professor (revision 8415451)
-Renault F1 (revision 8450843)
-S-bus (revision 8455589)
-Salomonøerne (revision 8238961)
-Slaget om Belgien (1940) (revision 8430013)
-Slaget om Guadalcanal (revision 7762887)
-Slaget om Henderson Field (revision 8445480)
-Slaget om Iwo Jima (revision 8145239)
-Soldiers of Love (Lighthouse X-sang) (revision 8452929)
-Solen (revision 8276478)
-Stillehavskrigen (revision 8430649)
-Stockholm (revision 8358042)
-Søslaget ved Guadalcanal (revision 7772812)
-Thomas Edison (revision 8282441)
-Togulykken ved Bad Aibling (revision 8455364)
-Topografi (revision 6886168)
-USA (revision 8448088)
-United States Army (revision 8401635)
-United States Marine Corps (revision 8401667)
-Vestallierede (revision 6961443)
-Wikimedia (revision 8263252)
-Wikipedia (revision 8267051)
-Zikavirus (revision 8454832)
-1. februar (revision 8404985)
-10. februar (revision 6877431)
-11. februar (revision 6877433)
-12. februar (revision 6877437)
-13. februar (revision 6877438)
-14. februar (revision 6877441)
-1497 (revision 7369489)
-15. februar (revision 7329463)
-1560 (revision 7874693)
-1568 (revision 7369703)
-1620 (revision 7423903)
-1688 (revision 7367090)
-18. februar (revision 6877450)
+Forside (revision 10000691)
+Hans Magnus Enzensberger (revision 11341046)
+28. november (revision 9410945)
+Golfkrigen (revision 11144370)
+29. november (revision 6877900)
+8. december (revision 10277754)
+Det Konservative Folkeparti (revision 11313857)
+1990 (revision 11340072)
+1940 (revision 11263756)
+Angolas håndboldlandshold (damer) (revision 11331888)
+Skjoldvulkan (revision 10870812)
+Casper & Mandrilaftalen (revision 11221713)
+26. november (revision 10617630)
+Døde i 2022 (revision 11343986)
+Vikingetidens rustning og våben (revision 11332607)
+Middelaldercentret (revision 11339897)
+Ruslands invasion af Ukraine 2022 (revision 11335164)
+Saddam Hussein (revision 11002258)
+The Jimi Hendrix Experience (revision 10497780)
+Færøerne (revision 11333678)
+27. november (revision 9745974)
+Thomas Vinterberg (revision 11234643)
+Anwar Ibrahim (revision 11342876)
+Mandatområdet i Palæstina (revision 11341286)
+Kunst (revision 11336917)
+Afrikamesterskabet i håndbold 2022 (kvinder) (revision 11341917)
+Dansk (sprog) (revision 11313509)
+Sergej Sjojgu (revision 11309097)
+Fernando Gomes (revision 11340427)
+Folketinget (revision 11330485)
+15. januar (revision 10515606)
+Rock and Roll Hall of Fame (revision 8408189)
+Thomas Edison (revision 11052704)
+Ukraine (revision 11334630)
+1947 (revision 11252357)
+1937 (revision 11303923)
+IC4 (revision 11317878)
+Jimi Hendrix (revision 11341476)
+Ismail Sabri Yaakob (revision 11105534)
+Okipa-ceremonien (revision 11340589)
+SI-præfiks (revision 11332802)
+Sporvejsmuseet Skjoldenæsholms historie (revision 11338275)
+Irak (revision 11255676)
+Woodstockfestivalen (revision 11226413)
+Nikolaj Lie Kaas (revision 11322663)
+Torben Rechendorff (revision 11342962)
+Folketingsvalget 2022 (revision 11339557)
+Kherson (revision 11314559)
+Keltere (revision 11318773)
+Little Richard (revision 11226619)
+Invasion (revision 10307980)
+Tate Gallery (revision 8312688)
+24. januar (revision 10441562)
+Hans Christian Ægidius (revision 9773029)
+Slaget ved Irpin (1321) (revision 11230064)
+Auschwitz (revision 11310714)
+Jazz fusion (revision 11223082)
+Lutsk (revision 11248429)
+Planetarium (revision 11266837)
+Bibliothèque nationale de France (revision 11055813)
+Digtsamling (revision 10585337)
+Kenneth Gøtterup (revision 11027437)
+Straf (revision 11007456)
+1716 (revision 11339928)
+Kamel (revision 11285016)
+Amnesti (revision 10831621)
+Zulu Royal (revision 10969220)
+Stephen Roche (revision 11239346)
+13. december (revision 10768225)
+Enhed (politisk parti) (revision 10158693)
+The Everly Brothers (revision 10865882)
+3. november (revision 9423371)
+Annelise Gotfredsen (revision 11306090)
+Virtual International Authority File (revision 8702589)
+Europæiske Fællesskab (revision 10868689)
+Væringer (revision 11331002)
+Rom (revision 11341285)
+Decentralisering (revision 11154770)
+Kreml (Moskva) (revision 11045482)
+Folketingsvalget 1994 (revision 11266325)
+28. december (revision 6878014)
+Østjyllands Storkreds (revision 11201505)
+Bruxelles (revision 10802416)
+Erik Haunstrup Clemmensen (revision 10627614)
+Hviderussere (revision 10750673)
+Hvidmelet Gåsefod (revision 11317723)
+Mario Draghi (revision 11302527)
+Folketingsvalget 1968 (revision 11300317)
+Skudår (revision 10360386)
+1921 (revision 11303917)
+Rundkørsel (revision 11103019)
+Valerij Zaluzjnyj (revision 11335164)
+Angrebet på Pearl Harbor (revision 11309782)
+Folketingsmedlemmer valgt i 2007 (revision 11187293)
+Ingeniørvidenskab (revision 9816520)
+Vikinger (revision 11327511)
+Martin Luther King (revision 11320659)
+1757 (revision 11186195)
+Dieseltogsæt (revision 8177984)
+El-værk (revision 11334293)
+Soul (revision 11283982)
+John McVie (revision 11040471)
+Botswanas håndboldlandshold (herrer) (revision 11333322)
+1971 (revision 11243510)
+Rana Hussein (revision 11266594)
+DR (revision 11342995)
+Ewan McGregor (revision 11331681)
+Eliane Paulo (revision 10589121)
+Zepto- (revision 11332802)
+København (revision 11336925)
+Gallien (revision 9984925)
+Augustoprøret (revision 11234324)
+1991 (revision 11250037)
+Afledte SI-enheder (revision 11097802)
+Gemeinsame Normdatei (revision 11281765)
+Litteraturvidenskab (revision 10931878)
+Thorvald Stauning (revision 11107677)
+Afrikamesterskabet i håndbold 2018 (mænd) (revision 11131830)
+Folkeforbundet (revision 11315450)
+Readymades (revision 10932287)
+Al Anbar (revision 9458175)
+2007 (revision 11250033)
+Varieté (revision 10934358)
+Damaskus (revision 11030795)
+Palæstina (revision 11311424)
+1569 (revision 10832219)
+Pædagog (revision 11251603)
+Carina Christensen (revision 11073847)
+Vest-Tyskland (revision 10580737)
+20. november (revision 6877846)
+Tessa Jowell (revision 11225831)
+Hillerød (revision 11317306)
+Påskekrisen (1920) (revision 11287865)
+Grad (vinkelmål) (revision 9624298)
+Kvinde (revision 11333939)
+1931 (revision 11236350)
+Afrikaans (revision 11080347)
+Den Store Danske Encyklopædi (revision 11301417)
+22. juni (revision 10375853)
+Automatic Train Control (revision 10619401)
+Luc Montagnier (revision 11162267)
+Reprise Records (revision 11081843)
+1966 (revision 11336105)
+Prosa (skriveform) (revision 11236012)
+Michael af Rumænien (revision 10819975)
+Mykolajiv (revision 11236676)
+Khmelnytskyj oblast (revision 11188686)
+Sierra Leones håndboldlandshold (herrer) (revision 11333322)
+1969 (revision 11340081)
+H. Edvard Hass (revision 10348478)
+Københavns Idrætspark (revision 9400386)
+Sanna Nielsen (revision 11315712)
+19. maj (revision 7148596)
+Patricia Schumann (revision 10952761)
+Torstenssonfejden (revision 11326728)
+International Standard Name Identifier (revision 10880739)
+Bent Mejding (revision 11335462)
+Afdeling Q (revision 11279134)
+Alfred Bindslev (revision 10398140)
+Sakser (revision 9042633)
+Folketingsmedlemmer valgt i 1998 (revision 11213304)
+1996 (revision 11229565)
+1 (tal) (revision 9378579)
+Farrah Fawcett (revision 10977527)
+Google+ (revision 10469085)
+1530 (revision 10865231)
+De største helte (revision 10737852)
+Afrikamesterskabet i håndbold 1974 (mænd) (revision 11018946)
+1902 (revision 11217211)
+ISO 639-3 (revision 10880691)
+1974 (revision 11336110)
+Dansk fonologi (revision 11226101)
+Europa (revision 11149054)
+Sovemedicin (revision 11327388)
+Slotsbryggen (Nykøbing Falster) (revision 11005548)
+Olieraffinaderi (revision 11322152)
+Slaget ved Stiklestad (revision 11261889)
+Rolling Stone (revision 11267586)
+Jørgen Hald (revision 10296412)
+Nikolaj Coster-Waldau (revision 11228953)
+Aserbajdsjan (revision 11297538)
+Kultstatus (revision 7820159)
+Al Kut (revision 9425606)
+Library of Congress Control Number (revision 8316539)
+Rwandas håndboldlandshold (herrer) (revision 11333322)
+Levon Helm (revision 11317127)
+Howard Hughes (revision 11040881)
+Wim Kieft (revision 10910953)
+Afrikamesterskabet i håndbold 2016 (mænd) (revision 11018957)
+24. februar (revision 10755036)
+Iværksætter (revision 10972242)
+1992 (revision 11303945)
+Internationalt Standardbognummer (revision 11037702)
+Afrikamesterskabet i håndbold 2014 (mænd) (revision 11018956)
+En mand kommer hjem (revision 10737861)
+Jamaica (revision 11243987)
+Henitjesk (revision 11328921)
+August (revision 11210562)
== End of Parsed pages ==
-- Wikipedia parsing ended at: 2016-02-19 17:56:42.162636
+- Wikipedia parsing ended at: 2022-11-30 19:41:17.518631
-53 characters appeared 1301488 times.
+60 characters appeared 1532370 times.
-First 30 characters:
-[ 0] Char e: 15.272749345364689 %
-[ 1] Char r: 8.48482659847805 %
-[ 2] Char n: 7.695652975670924 %
-[ 3] Char t: 6.977014002434137 %
-[ 4] Char a: 6.780469739252302 %
-[ 5] Char i: 6.164636170291236 %
-[ 6] Char s: 6.0942551909814 %
-[ 7] Char d: 5.953493232361728 %
-[ 8] Char l: 5.076650725938311 %
-[ 9] Char o: 4.883026197706011 %
-[10] Char g: 4.012253666572415 %
-[11] Char k: 3.232607599916403 %
-[12] Char m: 3.0863135119186653 %
-[13] Char f: 2.701600014752345 %
-[14] Char v: 2.13970470722742 %
-[15] Char b: 1.982423195603801 %
-[16] Char u: 1.8339777239590376 %
-[17] Char p: 1.5789619266562582 %
-[18] Char h: 1.3433085821767086 %
-[19] Char ø: 0.8730775850411222 %
-[20] Char y: 0.859938777768216 %
-[21] Char å: 0.7699648402443973 %
-[22] Char æ: 0.7208671920140639 %
-[23] Char j: 0.644108896893402 %
-[24] Char c: 0.5698093259407694 %
-[25] Char w: 0.11087309295206717 %
-[26] Char z: 0.05309307500338075 %
-[27] Char x: 0.032424424965885205 %
-[28] Char é: 0.032193919575132464 %
-[29] Char q: 0.012139950579644223 %
+First 31 characters:
+[ 0] Char e: 15.035728968852169 %
+[ 1] Char r: 8.617892545534042 %
+[ 2] Char n: 7.618264518360449 %
+[ 3] Char t: 6.856503324915001 %
+[ 4] Char a: 6.475133290262796 %
+[ 5] Char i: 6.3714377076032545 %
+[ 6] Char s: 6.279488635251278 %
+[ 7] Char d: 5.919523352715076 %
+[ 8] Char l: 5.094722553952375 %
+[ 9] Char o: 4.86860223053179 %
+[10] Char g: 3.8343872563414845 %
+[11] Char k: 3.3303314473658454 %
+[12] Char m: 3.2096034247603384 %
+[13] Char f: 2.608247355403721 %
+[14] Char v: 2.342188896937424 %
+[15] Char u: 1.9602967951604378 %
+[16] Char b: 1.9047619047619049 %
+[17] Char p: 1.5793183108518178 %
+[18] Char h: 1.45728512043436 %
+[19] Char ø: 0.8954103773892728 %
+[20] Char æ: 0.7449897870618715 %
+[21] Char å: 0.7295235484902471 %
+[22] Char y: 0.6777736447463732 %
+[23] Char j: 0.666418684782396 %
+[24] Char c: 0.5946344551250677 %
+[25] Char w: 0.12248999915164091 %
+[26] Char z: 0.06571519933175407 %
+[27] Char x: 0.045354581465311905 %
+[28] Char é: 0.021926819240783886 %
+[29] Char ó: 0.009592983417842949 %
+[30] Char q: 0.009397208246050236 %
-The first 30 characters have an accumulated ratio of 0.9997241618823994.
+The first 31 characters have an accumulated ratio of 0.9994694492844417.
-964 sequences found.
+1065 sequences found.
-First 512 (typical positive ratio): 0.9968082796759031
-Next 512 (512-1024): 7.68351302509128e-07
-Rest: 3.903127820947816e-17
+First 512 (typical positive ratio): 0.9958348814328518
+Next 512 (512-1024): 2.6103356239028435e-06
+Rest: 3.268948339453948e-05
-- Processing end: 2016-02-19 17:56:42.304278
+- Processing end: 2022-11-30 19:41:17.605842
diff --git a/script/charsets/ibm865.py b/script/charsets/ibm865.py
new file mode 100644
index 0000000..7fc3122
--- /dev/null
+++ b/script/charsets/ibm865.py
@@ -0,0 +1,71 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# ##### BEGIN LICENSE BLOCK #####
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Jehan <jehan@girinstud.io>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ##### END LICENSE BLOCK #####
+
+from codepoints import *
+
+name = 'IBM865'
+aliases = ['CP865', '865', 'CSIBM865']
+
+language = \
+{
+ 'complete': [ 'no', 'da' ],
+ 'incomplete': []
+}
+
+# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
+charmap = \
+[
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
+ SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
+ SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,SYM,SYM, # 9X
+ LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # AX
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # CX
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # DX
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM, # EX
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # FX
+]
diff --git a/script/langs/da.py b/script/langs/da.py
index 18d2379..31226a0 100644
--- a/script/langs/da.py
+++ b/script/langs/da.py
@@ -50,7 +50,7 @@ code = 'da'
# ASCII characters are also used in Danish.
use_ascii = True
# The charsets we want to support and create data for.
-charsets = ['ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252']
+charsets = ['ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252', 'IBM865' ]
## Optional Properties ##
diff --git a/script/langs/no.py b/script/langs/no.py
new file mode 100644
index 0000000..93cf23f
--- /dev/null
+++ b/script/langs/no.py
@@ -0,0 +1,55 @@
+#!/bin/python3
+# -*- coding: utf-8 -*-
+
+# ##### BEGIN LICENSE BLOCK #####
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Jehan <jehan@girinstud.io>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ##### END LICENSE BLOCK #####
+
+## Mandatory Properties ##
+
+name = 'Norwegian'
+code = 'no'
+use_ascii = True
+charsets = ['IBM865', 'ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252']
+
+## Optional Properties ##
+
+# Alphabet characters.
+alphabet = 'æøåéìîàêÆØÅ'
+# Some start pages that should contain mostly Norwegian text.
+start_pages = ['Norsk', 'Saft', 'Hund']
+wikipedia_code = code
+case_mapping = True