from string import ascii_letters
from typing import List, Optional
class Language:
    """Descriptive metadata for a single language.

    All constructor arguments are stored unchanged as same-named attributes,
    except ``alphabet``, which is normalised to a string holding each distinct
    character once, in sorted (code point) order.

    :ivar name: Name of the language.
    :ivar iso_code: Short code identifying the language.
    :ivar use_ascii: Whether the ASCII letters are merged into ``alphabet``.
    :ivar charsets: Names of character encodings associated with the language.
    :ivar alphabet: The distinct characters of the language's alphabet, sorted.
        Never empty and never ``None`` once construction succeeds.
    :ivar wiki_start_pages: Page titles associated with the language
        (presumably Wikipedia pages; how they are consumed is not visible
        in this module).
    """

    def __init__(
        self,
        name: Optional[str] = None,
        iso_code: Optional[str] = None,
        use_ascii: bool = True,
        charsets: Optional[List[str]] = None,
        alphabet: Optional[str] = None,
        wiki_start_pages: Optional[List[str]] = None,
    ) -> None:
        """Store the metadata and build the normalised alphabet.

        :param name: Name of the language.
        :param iso_code: Short code identifying the language.
        :param use_ascii: If true, ``string.ascii_letters`` is appended to
            ``alphabet`` (or used on its own when ``alphabet`` is empty).
        :param charsets: Names of character encodings for the language.
        :param alphabet: Characters of the language's alphabet. Required
            (non-empty) when ``use_ascii`` is false.
        :param wiki_start_pages: Page titles associated with the language.
        :raises ValueError: If ``use_ascii`` is false and ``alphabet`` is
            missing or empty.
        """
        super().__init__()
        # Attribute assignment order is significant: ``__repr__`` lists the
        # attributes in ``__dict__`` insertion order.
        self.name = name
        self.iso_code = iso_code
        self.use_ascii = use_ascii
        self.charsets = charsets
        if self.use_ascii:
            # ASCII letters are always part of the alphabet in this mode, so a
            # missing or empty ``alphabet`` falls back to the ASCII letters.
            alphabet = alphabet + ascii_letters if alphabet else ascii_letters
        elif not alphabet:
            raise ValueError("Must supply alphabet if use_ascii is False")
        # ``alphabet`` is guaranteed to be non-empty at this point, so the
        # attribute is always a string; duplicates are dropped via ``set``.
        self.alphabet: str = "".join(sorted(set(alphabet)))
        self.wiki_start_pages = wiki_start_pages

    def __repr__(self) -> str:
        """Return ``ClassName(attr=value, ...)`` for every public attribute.

        Attributes whose names start with an underscore are omitted.
        """
        public_attrs = (
            f"{key}={value!r}"
            for key, value in self.__dict__.items()
            if not key.startswith("_")
        )
        return f"{self.__class__.__name__}({', '.join(public_attrs)})"
# Registry of per-language metadata, keyed by the language's English name
# (each key matches the ``name`` of the ``Language`` it maps to).
# Entries with ``use_ascii=False`` spell out their complete alphabet; the
# others list only the additional, non-ASCII letters.
LANGUAGES = {
    "Arabic": Language(
        name="Arabic",
        iso_code="ar",
        use_ascii=False,
        charsets=["ISO-8859-6", "WINDOWS-1256", "CP720", "CP864"],
        alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
        wiki_start_pages=["الصفحة_الرئيسية"],
    ),
    "Belarusian": Language(
        name="Belarusian",
        iso_code="be",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "IBM866", "MacCyrillic"],
        alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ",
        wiki_start_pages=["Галоўная_старонка"],
    ),
    "Bulgarian": Language(
        name="Bulgarian",
        iso_code="bg",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "IBM855"],
        alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
        wiki_start_pages=["Начална_страница"],
    ),
    "Czech": Language(
        name="Czech",
        # ISO 639-1 language code for Czech is "cs" ("CZ" is the ISO 3166
        # country code for Czechia, not a language code).
        iso_code="cs",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ",
        wiki_start_pages=["Hlavní_strana"],
    ),
    "Danish": Language(
        name="Danish",
        iso_code="da",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="æøåÆØÅ",
        wiki_start_pages=["Forside"],
    ),
    "German": Language(
        name="German",
        iso_code="de",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="äöüßẞÄÖÜ",
        wiki_start_pages=["Wikipedia:Hauptseite"],
    ),
    "Greek": Language(
        name="Greek",
        iso_code="el",
        use_ascii=False,
        charsets=["ISO-8859-7", "WINDOWS-1253"],
        alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ",
        wiki_start_pages=["Πύλη:Κύρια"],
    ),
    "English": Language(
        name="English",
        iso_code="en",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
        wiki_start_pages=["Main_Page"],
    ),
    "Esperanto": Language(
        name="Esperanto",
        iso_code="eo",
        use_ascii=False,
        charsets=["ISO-8859-3"],
        alphabet="abcĉdefgĝhĥijĵklmnoprsŝtuŭvzABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ",
        wiki_start_pages=["Vikipedio:Ĉefpaĝo"],
    ),
    "Spanish": Language(
        name="Spanish",
        iso_code="es",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
        wiki_start_pages=["Wikipedia:Portada"],
    ),
    "Estonian": Language(
        name="Estonian",
        iso_code="et",
        use_ascii=False,
        charsets=["ISO-8859-4", "ISO-8859-13", "WINDOWS-1257"],
        alphabet="ABDEGHIJKLMNOPRSTUVÕÄÖÜabdeghijklmnoprstuvõäöü",
        wiki_start_pages=["Esileht"],
    ),
    "Finnish": Language(
        name="Finnish",
        iso_code="fi",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ÅÄÖŠŽåäöšž",
        wiki_start_pages=["Wikipedia:Etusivu"],
    ),
    "French": Language(
        name="French",
        iso_code="fr",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
        wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
    ),
    "Hebrew": Language(
        name="Hebrew",
        iso_code="he",
        use_ascii=False,
        charsets=["ISO-8859-8", "WINDOWS-1255"],
        alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
        wiki_start_pages=["עמוד_ראשי"],
    ),
    "Croatian": Language(
        name="Croatian",
        iso_code="hr",
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcčćdđefghijklmnoprsštuvzžABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ",
        wiki_start_pages=["Glavna_stranica"],
    ),
    "Hungarian": Language(
        name="Hungarian",
        iso_code="hu",
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcdefghijklmnoprstuvzáéíóöőúüűABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ",
        wiki_start_pages=["Kezdőlap"],
    ),
    "Italian": Language(
        name="Italian",
        iso_code="it",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ÀÈÉÌÒÓÙàèéìòóù",
        wiki_start_pages=["Pagina_principale"],
    ),
    "Lithuanian": Language(
        name="Lithuanian",
        iso_code="lt",
        use_ascii=False,
        charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
        alphabet="AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽaąbcčdeęėfghiįyjklmnoprsštuųūvzž",
        wiki_start_pages=["Pagrindinis_puslapis"],
    ),
    "Latvian": Language(
        name="Latvian",
        iso_code="lv",
        use_ascii=False,
        charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
        alphabet="AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽaābcčdeēfgģhiījkķlļmnņoprsštuūvzž",
        wiki_start_pages=["Sākumlapa"],
    ),
    "Macedonian": Language(
        name="Macedonian",
        iso_code="mk",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
        alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш",
        wiki_start_pages=["Главна_страница"],
    ),
    "Dutch": Language(
        name="Dutch",
        iso_code="nl",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
        wiki_start_pages=["Hoofdpagina"],
    ),
    "Polish": Language(
        name="Polish",
        iso_code="pl",
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż",
        wiki_start_pages=["Wikipedia:Strona_główna"],
    ),
    "Portuguese": Language(
        name="Portuguese",
        iso_code="pt",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
        wiki_start_pages=["Wikipédia:Página_principal"],
    ),
    "Romanian": Language(
        name="Romanian",
        iso_code="ro",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="ăâîșțĂÂÎȘȚ",
        wiki_start_pages=["Pagina_principală"],
    ),
    "Russian": Language(
        name="Russian",
        iso_code="ru",
        use_ascii=False,
        charsets=[
            "ISO-8859-5",
            "WINDOWS-1251",
            "KOI8-R",
            "MacCyrillic",
            "IBM866",
            "IBM855",
        ],
        alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
        wiki_start_pages=["Заглавная_страница"],
    ),
    "Slovak": Language(
        name="Slovak",
        iso_code="sk",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ",
        wiki_start_pages=["Hlavná_stránka"],
    ),
    "Slovene": Language(
        name="Slovene",
        iso_code="sl",
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcčdefghijklmnoprsštuvzžABCČDEFGHIJKLMNOPRSŠTUVZŽ",
        wiki_start_pages=["Glavna_stran"],
    ),
    # NOTE(review): unlike every other entry, Serbian does not pass
    # ``use_ascii``, so the constructor default (True) applies and the ASCII
    # letters are merged into this Cyrillic alphabet. The other Cyrillic
    # entries use ``use_ascii=False`` - confirm whether this is intentional.
    "Serbian": Language(
        name="Serbian",
        iso_code="sr",
        alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш",
        charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
        wiki_start_pages=["Главна_страна"],
    ),
    "Thai": Language(
        name="Thai",
        iso_code="th",
        use_ascii=False,
        charsets=["ISO-8859-11", "TIS-620", "CP874"],
        alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
        wiki_start_pages=["หน้าหลัก"],
    ),
    "Turkish": Language(
        name="Turkish",
        iso_code="tr",
        use_ascii=False,
        charsets=["ISO-8859-3", "ISO-8859-9", "WINDOWS-1254"],
        alphabet="abcçdefgğhıijklmnoöprsştuüvyzâîûABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ",
        wiki_start_pages=["Ana_Sayfa"],
    ),
    "Vietnamese": Language(
        name="Vietnamese",
        iso_code="vi",
        use_ascii=False,
        charsets=["WINDOWS-1258"],
        alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY",
        wiki_start_pages=["Chữ_Quốc_ngữ"],
    ),
}