7SSDATWD4YOCDC3UQKKJI5T7E6HM6E4DEA4BFUAM5LAIT5WHEWAQC
M2U4ANU7R2UXRYU2XMWNTFBV4WPHFAE6YJ5DS2MUAXG4DMGII6FAC
S7AM7DDBCOCJRZDG2AT224KCGL4J47QOR2I6RNCE4M7SQQSHDHJAC
BUQKLZRNOPFFB63UUIN3KAQLRTOOX52HZTDHYOCBCYNPEE6LMIJQC
F5IPK5DFDXPC5HQPCAAKSY67JFBY7JI3JUO2SORNQTQ6TH257N6AC
KJXDMDMJ4O4BMJL5HF2MY23OKBRZJ57JPD2BUBJXBUUQPMMBFZNAC
77ZDLLUTFQ3L3FELAWZQTOG5DUPINOXV5VYLEYBEONDOW2BSVZJAC
PXJLVN77FE6X3RAQBMZTBRRIWTOA2XT7ESYUTHLUCA73DB2WJOMAC
FIAMBM46OAJS5A33FNXF5MKSSK7BOUKAXRGS7Z5MDZK44ZVXPYSAC
SDQM4ROF7EPX5G7Z4V37ZNWKWNC7ZHZQQ4ZVKLDZTLJSERCSC4CQC
FCCU7MKCCJF3WF3ZEJPLOERQYWX42UXS4QCVM7AXH6O4S765KJYAC
IRHRXNNTKIHRBLEAKRTINBLZVSDN43TZMB5QD6M55RBYJWHSGE4QC
ICGDK5NMJH7PYZF3C6CAIXF6COSNX6ORYRGOPZNWJRRHWGZWMAHQC
56LYC22XXLMHGOXXDZ3V6ZVSPHXXBQ6XU75QSXJUW7PTKOMZHZ4QC
WWZOH45TLD6X4YGTMXSAP4W7WBYBKVTNU3WYWMMCI6FJRVRUYMJAC
6YMH7UABZZQXOZWJOA4VKS63TRXJHBSZIZPYLNQYTBGXA7SRSV4AC
N33V6HJBQJELE567YWQQDBXN5YZ555GQ32HJE6MV5QH2HUIWHPUAC
ZAHK7S53I3RLV7NAY7CND3N5SDQMKFWARDRJFP6JP5HMTYLJZYIQC
MU2YNCBOBEBDCFVQZZEMLQW6HF5AY53M62UYPIS6I2NVDPPFZKUQC
WY2RZ3DJUNJTIH3UOCQUQZLVGMEEDLEJIIYTWZFCPMJ4CJKKXXTQC
OA7ZVXKGIETV6MOKQGROPTQWZVZ3BUEXKFCYDU74UJS6FLH4QSBAC
WWZTP44OMUGMTSNEG4YI6H2YTHAC4DLQMG6JHKXSVUKIWBDC7IGAC
2EZN4P6CWDDVJ2TNQJV7GVV26FORSVKVCP6II2IL2NXLRXCDUPSAC
DIJO2HGF3PO7BSYOUZN2DMHU2SRR6CLQ245XKXSETZPJHF6RRKDAC
YYDITCHVLMT2AE3P65STHJVHNG47BE3TSW3FNSXK2XCOB3VIRJQQC
7WUXJS4Q6T5YK322HTOLFHDKIZ5N7K5NRB3MMDTBLWSTXU7P5OEQC
|> Stream.map(fn file -> File.stream!(Path.join(@priv_dir, file)) end)
|> Stream.map(fn stream -> extract_book(stream) end)
|> Stream.map(fn stream -> normalize_text(stream) end)
|> Stream.map(&collect_sentences/1)
|> Stream.map(&normalize_sentences/1)
|> Stream.map(&filter_sentences/1)
|> Stream.map(fn file -> process_file(Path.join(@priv_dir, file)) end)
# NOTE(review): this region is corrupted. The header and first lines of
# `process_file/1` are spliced into the body of `sentence_to_words/1`
# (nested `def` is invalid Elixir), and `process_file` is never closed here.
# Code is kept byte-for-byte; only comments are added. Restore from the
# original file.
def sentence_to_words(sentence) do
# Whitelist of codepoints allowed in Esperanto words: the 27 letters of the
# alphabet (no q/w/x/y) plus the apostrophe used for elided -o endings.
permitted_symbols = MapSet.new(~w(' a b c ĉ d e f g ĝ h ĥ i j ĵ k l m n o p r s ŝ t u ŭ v z))
# NOTE(review): the next five lines belong to `process_file/1`, not here.
def process_file(file) do
fp = File.open!(file)
stream = IO.binstream(fp, :line) |> Stream.filter(fn l -> l !== "\n" end)
t = get_title(stream)
a = get_author(stream)
# Apparent original body of `sentence_to_words/1`: lowercase, replace every
# non-permitted codepoint with a space, re-split into words, drop 1-char
# tokens, then resolve apostrophe elisions.
sentence
|> String.downcase()
|> String.codepoints()
|> Enum.map(fn symb ->
if MapSet.member?(permitted_symbols, symb) do
symb
else
" "
end
end)
|> List.to_string()
|> String.split()
|> Stream.filter(fn word -> String.length(word) > 1 end)
|> Stream.map(fn word ->
# Words ending in "'" are elided forms ("lingv'" -> {"lingv'", "lingvo"})
# unless a known alias overrides the expansion; a bare two-char "x'" token
# is dropped (nil), and a word quoted in apostrophes is just unquoted.
# NOTE(review): `aliases` is not bound anywhere in this visible chunk —
# presumably a module attribute or closure variable; confirm.
if String.ends_with?(word, "'") do
case aliases[word] do
nil ->
if String.starts_with?(word, "'") do
String.trim(word, "'")
else
if String.length(word) == 2 do
nil
else
{word, String.replace(word, "'", "o")}
end
end
als ->
{word, als}
end
else
word
end
end)
|> Stream.filter(fn word -> word != nil end)
end
# Lists the corpus files bundled under the module's priv directory.
defp get_files(), do: File.ls!(@priv_dir)
# Runs the raw line stream through a chain of single-line parser functions,
# starting with `parse_title/1`. Each parser receives one line and returns
# `{value_or_nil, next_parser}`; emitted values flow downstream, and a `nil`
# parser halts the stream entirely.
defp extract_book(stream) do
  Stream.transform(stream, &parse_title/1, fn
    _line, nil ->
      {:halt, nil}

    line, parse ->
      case parse.(line) do
        {nil, next} -> {[], next}
        {value, next} -> {[value], next}
      end
  end)
end
# Parser step: on the Gutenberg "Title: " header, emit `{:title, t}` and hand
# off to `parse_author/1`; on any other line keep looking for the title.
defp parse_title("Title: " <> rest = _line), do: {{:title, String.trim(rest)}, &parse_author/1}
defp parse_title(_line), do: {nil, &parse_title/1}
# Parser step: once the "Author: " header is seen, emit `{:author, name}` and
# switch to a composed parser that (1) skips ahead past the "*** START OF"
# marker, (2) discards 7 further lines (`skip_lines/2` consumes one line even
# when its counter is 0), then (3) keeps every line until one starts with
# "End of".
# (Fix: the original clause was missing its closing `end`.)
defp parse_author("Author: " <> name = _line) do
  {{:author, String.trim(name)},
   skip_until(
     fn line -> String.starts_with?(line, "*** START OF") end,
     skip_lines(
       6,
       keep_until(fn line -> String.starts_with?(line, "End of") end)
     )
   )}
end
# Builds a parser fun that emits nothing and discards lines until `predicate`
# matches; the matching line itself is also consumed, after which control
# passes to the `next` parser.
defp skip_until(predicate, next) do
  fn line ->
    continuation = if predicate.(line), do: next, else: skip_until(predicate, next)
    {nil, continuation}
  end
end
# Returns a parser fun that discards lines unconditionally. Because the fun
# consumes a line even when `n == 0`, a total of `n + 1` lines are skipped
# before `next` takes over.
# (Fix: the original was missing its closing `end`.)
defp skip_lines(n, next) do
  fn _ ->
    if n == 0 do
      {nil, next}
    else
      {nil, skip_lines(n - 1, next)}
    end
  end
end
# Scans the stream for the Gutenberg "Author: " header line and returns the
# author name, trimmed. Raises MatchError if the stream contains no such
# line (the `[author] =` match fails on an empty take).
# (Fix: the original was missing its closing `end`.)
defp get_author(stream) do
  stream = stream |> Stream.drop_while(fn line -> !String.starts_with?(line, "Author:") end)
  [author] = Enum.take(stream, 1)
  "Author: " <> a = author
  a |> String.trim()
end
# Builds a parser fun that emits `{:line, line}` for every input until
# `predicate` matches, at which point it returns a `nil` parser — the signal
# for `extract_book/1` to halt the stream.
defp keep_until(predicate) do
  fn line ->
    if predicate.(line),
      do: {nil, nil},
      else: {{:line, line}, keep_until(predicate)}
  end
end
# Cleans a tagged line stream: rejects lines beginning with a space, trims
# the rest, drops empties and separator art, strips markup characters, and
# collapses internal whitespace runs. Non-line elements pass through each
# stage untouched.
defp normalize_text(stream) do
  stream
  |> Stream.reject(fn
    {:line, " " <> _} -> true
    _ -> false
  end)
  |> Stream.map(&strip/1)
  |> drop_empty()
  |> drop_absurdus()
  |> Stream.map(&remove_special/1)
  |> Stream.map(&strip_inside/1)
end
# Trims surrounding whitespace from a tagged line; anything that is not a
# `{:line, _}` tuple passes through unchanged.
defp strip({:line, text}), do: {:line, String.trim(text)}
defp strip(passthrough), do: passthrough
# Collapses every run of internal whitespace in a tagged line down to a
# single space; non-line elements pass through unchanged.
defp strip_inside({:line, text}), do: {:line, String.replace(text, ~r/(\s+)/, " ")}
defp strip_inside(passthrough), do: passthrough
# NOTE(review): this region is corrupted. `drop_empty/1` has no body of its
# own — the head of a separate `get_text/1` function and fragments of (at
# least) two pipelines are spliced together below, and neither function is
# closed. Code is kept byte-for-byte; only comments are added.
defp drop_empty(stream) do
# NOTE(review): nested `defp` is invalid Elixir — this head belongs to a
# separate `get_text/1` function.
defp get_text(stream) do
# Apparent intent of `drop_empty/1`: remove `{:line, ""}` elements.
|> Stream.filter(fn
{:line, ""} -> false
_ -> true
# NOTE(review): the `end)` closing the filter is missing; the lines below
# look like `get_text/1`'s body (strip the Gutenberg preamble/trailer,
# NFD-normalize, strip punctuation), with its own tail lost.
|> Stream.drop_while(fn line -> !String.starts_with?(line, "*** START OF THE") end)
|> Stream.drop(1)
|> Stream.take_while(fn line -> !String.starts_with?(line, "*** END OF THE") end)
|> Stream.map(fn word -> String.normalize(word, :nfd) end)
|> Stream.map(fn word ->
String.replace(word, ~r/[!#$%&()*+,.:;<=>?@\^_`{|}~"\[\]«»\&]/, "")
# True for the decorative "* * * * *" separator lines that carry no prose.
# (Fix: the original declared the same clause twice; the second copy was
# unreachable and triggered a compiler warning — removed.)
defp is_absurdus("* * * * *"), do: true
defp is_absurdus(_), do: false
# Strips markup from a tagged line: emphasis underscores, double quotes, and
# bracketed editorial insertions (greedy `[...]` spans). Non-line elements
# pass through unchanged.
defp remove_special({:line, text}) do
  cleaned =
    Enum.reduce(["_", "\"", ~r/\[.+\]/], text, fn pattern, acc ->
      String.replace(acc, pattern, "")
    end)

  {:line, cleaned}
end

defp remove_special(passthrough), do: passthrough
# Re-chunks a stream of `{:line, text}` elements into `{:sentence, text}`
# elements, splitting on `.`, `?` and `!`. A line with no terminator is
# buffered in the accumulator until a later line completes the sentence.
# Non-`{:line, _}` elements (`{:title, _}`, `{:author, _}`) pass through.
# NOTE(review): a partial sentence still buffered when the stream ends is
# silently discarded (no after-fun is given to Stream.transform).
defp collect_sentences(stream) do
  stream
  |> Stream.transform(
    [],
    fn
      {:line, line}, acc ->
        splitted = String.split(line, ~r/\.|\?|\!/)

        case splitted do
          # No sentence terminator on this line: buffer it, emit nothing.
          [p] ->
            {[], [p | acc]}

          _ ->
            # One punctuation mark per split boundary, in order of appearance.
            punctuations =
              Regex.scan(~r/\.|\?|\!/, line)
              |> List.flatten()

            last = List.last(splitted)

            # The first fragment closes whatever was buffered from earlier
            # lines: trim each buffered piece, restore order, join.
            first_sentence =
              [List.first(splitted) <> List.first(punctuations) | acc]
              |> Stream.map(fn l -> String.trim(l) end)
              |> Enum.reverse()
              |> Enum.join(" ")

            # Remaining complete sentences on this line. Zip truncation
            # already excludes an unterminated trailing fragment, so only the
            # first pair (consumed by `first_sentence`) needs dropping.
            # (Bug fix: the original also applied `Stream.drop(-1)`, which
            # discarded the final complete sentence of every multi-sentence
            # line.)
            other_sentences =
              splitted
              |> Stream.zip(punctuations)
              |> Stream.drop(1)
              |> Enum.map(fn {s, p} -> {:sentence, s <> p} end)

            # An unterminated trailing fragment seeds the next buffer.
            new_acc =
              if last == "" do
                []
              else
                [last]
              end

            {[{:sentence, first_sentence}] ++ other_sentences, new_acc}
        end

      a, acc ->
        {[a], acc}
    end
  )
end
# Trims surrounding whitespace from every `{:sentence, _}` element; all other
# elements pass through unchanged.
defp normalize_sentences(stream) do
  Stream.map(stream, fn
    {:sentence, text} -> {:sentence, String.trim(text)}
    passthrough -> passthrough
  end)
end
# Keeps only sentences that pass every sanity check (minimum length, not on
# the blacklist); non-sentence elements pass through untouched.
# (Fix: the blacklist capture was garbled as `¬_blacklisted?/1` — an invalid
# token — restored to `&not_blacklisted?/1`.)
defp filter_sentences(stream) do
  stream
  |> Stream.filter(fn
    {:sentence, s} ->
      Enum.all?(
        [
          &long_sentence?/1,
          &not_blacklisted?/1
        ],
        fn chk -> chk.(s) end
      )

    _ ->
      true
  end)
end
# A sentence must be at least three graphemes long to count as content.
defp long_sentence?(text), do: String.length(text) > 2
# Rejects known non-sentence artifacts; currently only the exact "(Rim."
# fragment (a truncated editorial remark marker).
defp not_blacklisted?("(Rim."), do: false
defp not_blacklisted?(_other), do: true
# Public entry point: true when the token is recognized as Esperanto by any
# of the private classifiers.
def is_esperanto?(word), do: check_esperanto(word)
# A token counts as Esperanto when any single classifier accepts it. Alias
# tuples from `sentence_to_words/1` are accepted outright; the grammatical
# checks are ending-based and deliberately permissive.
defp check_esperanto(word) do
  classifiers = [
    &is_alias?/1,
    &is_table_word?/1,
    &is_in_vocab?/1,
    &is_conjuction?/1,
    &is_pronoun?/1,
    &is_preposition?/1,
    &is_noun?/1,
    &is_adjective?/1,
    &is_adverb?/1,
    &is_verb?/1
  ]

  Enum.any?(classifiers, & &1.(word))
end
# Alias tokens are 2-tuples `{original, canonical}`; anything else is a
# plain word.
defp is_alias?({_original, _canonical}), do: true
defp is_alias?(_plain), do: false
# Noun test by ending: -o plus optional plural -j and/or accusative -n.
defp is_noun?(word), do: String.ends_with?(word, ~w(o oj on ojn))
# Adjective test by ending: -a plus optional plural -j and/or accusative -n.
defp is_adjective?(word), do: String.ends_with?(word, ~w(a aj an ajn))
# Adverb test by ending: -e, or directional -en.
defp is_adverb?(word), do: String.ends_with?(word, ~w(e en))
# Personal-pronoun test: one of the nine pronoun roots, bare or carrying the
# accusative -n and/or possessive -a declensions (-a, -n, -aj, -an, -ajn).
defp is_pronoun?(word) do
  roots = ~w(mi ni vi li ŝi ĝi ili oni si)
  suffixes = ["", "a", "n", "aj", "an", "ajn"]

  Enum.any?(roots, fn root ->
    Enum.any?(suffixes, fn suffix -> word == root <> suffix end)
  end)
end
# True when `word` is one of the common Esperanto prepositions.
# NOTE(review): "maltrans" is kept from the original list even though it is
# unusual — confirm it is intentional.
defp is_preposition?(word) do
  Enum.member?(
    ~w(
      al anstataŭ antaŭ apud ĉe ĉirkaŭ da de ekster el en ĝis inter je
      kontraŭ krom malantaŭ maltrans po post preter sub super sur tra trans
    ),
    word
  )
end
# True when `word` is one of the Esperanto correlatives ("tabelvortoj"),
# including the plural/accusative forms where they exist. One row per
# ending series, columns ki-/ti-/i-/ĉi-/neni-.
defp is_table_word?(word) do
  correlatives = ~w(
    kio tio io ĉio nenio
    kion tion ion ĉion nenion
    kiu tiu iu ĉiu neniu
    kiun tiun iun ĉiun neniun
    kiuj tiuj iuj ĉiuj neniuj
    kiujn tiujn iujn ĉiujn neniujn
    kia tia ia ĉia nenia
    kian tian ian ĉian nenian
    kiaj tiaj iaj ĉiaj neniaj
    kiajn tiajn iajn ĉiajn neniajn
    kies ties ies ĉies nenies
    kie tie ie ĉie nenie
    kien tien ien ĉien nenien
    kiam tiam iam ĉiam neniam
    kial tial ial ĉial nenial
    kiel tiel iel ĉiel neniel
    kiom tiom iom ĉiom neniom
  )

  word in correlatives
end
# True for coordinating conjunctions. (The name keeps the original spelling
# "conjuction" because callers depend on it.)
defp is_conjuction?(word), do: word in ~w(kaj aŭ sed plus minus nek)
# Verb test by ending: infinitive -i, the tense endings -as/-is/-os, the
# conditional -us, and the volitive -u.
defp is_verb?(word), do: String.ends_with?(word, ~w(i as is os us u))
# True for common particles/adverbs that carry no grammatical ending and so
# would be missed by the suffix-based checks.
defp is_in_vocab?(word) do
  ~w(
    ajn almenaŭ ankaŭ apenaŭ do eĉ ja jen kvazaŭ mem nur pli plej preskaŭ
    tamen tre tro jes ne tuj por ĉar morgaŭ hieraŭ
  )
  |> MapSet.new()
  |> MapSet.member?(word)
end
# Fold step for `{:author, _}` elements: records the author on the current
# book node and returns the accumulator unchanged.
defp save({:author, name}, acc) do
  DB.set_author(acc.book_id, name)
  acc
end
# Minimum share (percent) of recognized Esperanto words a sentence needs in
# order to be persisted.
@desired_percentage 51
# Fold step for `{:sentence, _}` elements: tokenizes the sentence, and when
# enough of its words are recognized as Esperanto, stores the sentence, its
# words, and consecutive word-order edges via bounded-concurrency
# Task.async_stream pipelines.
# NOTE(review): this block is corrupted in two places (marked below). Code
# is kept byte-for-byte; only comments are added.
defp save({:sentence, sentence}, acc) do
words = Esperantisto.Text.sentence_to_words(sentence)
# Partition words into {esperanto, non_esperanto} lists (both built in
# reverse order).
check =
words
|> Enum.reduce({[], []}, fn word, {esp, not_esp} ->
case Esperantisto.Text.is_esperanto?(word) do
true ->
{[word | esp], not_esp}
# NOTE(review): the next three lines sit after the clause's return value
# and are unreachable/invalid — they look like an interleaved fragment of
# an older word-order implementation. `prev_word` is unbound here.
if prev_word do
DB.save_word_order(prev_word, word)
end
false ->
{esp, [word | not_esp]}
end
end)
case check do
{[], []} ->
Logger.debug("Empty sentence: #{inspect(sentence)}")
acc
{esp, not_esp} ->
len_esp = length(esp)
len_not_esp = length(not_esp)
total = len_esp + len_not_esp
percentage = len_esp * 100 / total
if percentage >= @desired_percentage do
sentence_id = DB.save_sentence(acc.book_id, sentence)
# Persist every word (at most 10 concurrent DB writes).
words
|> Task.async_stream(
fn word ->
DB.save_word(sentence_id, word)
word
end,
timeout: 30000,
max_concurrency: 10
)
|> Stream.map(fn {:ok, v} -> v end)
# Pair each word with its predecessor; the first emitted pair is
# {nil, first_word} — presumably the order-saving base clause tolerates a
# nil predecessor. TODO confirm.
|> Stream.scan(
{nil, nil},
fn word, {_, prev} -> {prev, word} end
)
|> Task.async_stream(
fn {prev, word} ->
DB.save_word_order(prev, word)
end,
timeout: 30000,
max_concurrency: 10
)
|> Stream.run()
acc
else
Logger.warn(
"Too many non esperanto words: #{len_not_esp}/#{total} (#{inspect(not_esp)}) in sentence #{sentence}"
)
acc
end
end
# NOTE(review): `word` is not in scope here, and every branch above already
# produced `acc` — this trailing expression looks like another interleaved
# fragment (it would also clobber the branch results). Verify against the
# original file.
%State{acc | prev_word: word}
end
# Attaches `sentence` to the book node with Neo4j node id `book_id` (MERGE,
# so re-runs are idempotent per text) and returns the sentence node's id.
# NOTE(review): corruption in this region — the closing `"""` below lacks
# the comma that must precede the parameter map, and the tail of the
# function is fused with a fragment of a different (alias-linking) query
# whose `def` head, `Bolt.Sips.query!(` call and opening `"""` are lost.
# Code kept byte-for-byte; only comments added.
def save_sentence(book_id, sentence) do
result =
Bolt.Sips.query!(
Bolt.Sips.conn(),
"""
MATCH (book)
WHERE id(book) = $id
WITH book
MERGE (book) -[:HAS_SENTENCE]-> (sentence:Sentence {text: $sentence})
RETURN id(sentence) as sentence_id
"""
%{id: book_id, sentence: sentence},
timeout: 25000
)
# Destructure the single returned row's sentence_id column.
[id] = result["sentence_id"]
id
# NOTE(review): everything from here to `end` belongs to the lost
# alias-linking function (parameters apparently `word` and `als`); restore
# its head from the original file.
MATCH (w1:Word {word: $word1})
MATCH (w2:Word {word: $word2})
MERGE (w1) -[:ALIAS_OF]-> (w2)
MERGE (w1) <-[:ALIAS_OF]- (w2)
""",
%{word1: word, word2: als},
timeout: 25000
)
end
# NOTE(review): this block is corrupted — the Cypher heredoc opened below is
# never closed in this chunk, the `MATCH (book)` lines appear fused in from
# a different query, and the argument map, closing `)` and the function's
# `end` are missing. Kept byte-for-byte; only comments added. Apparent
# purpose: attach `word` to the sentence node with id `sentence_id`.
def save_word(sentence_id, word) do
Bolt.Sips.query!(
Bolt.Sips.conn(),
"""
MATCH (sentence)
WHERE id(sentence) = $id
WITH sentence
MATCH (book)
WHERE id(book) = $id
WITH book
# Fans a word-order edge out over alias pairs. `sentence_to_words/1` yields
# either a plain word or an `{original, alias}` 2-tuple; these clauses expand
# a tuple on either side into one recursive call per member. The base clause
# for two plain words (the actual DB write) is defined elsewhere in this
# file.
# (Fix: the most specific clause — both arguments tuples — was declared
# last and therefore unreachable; it is now first. The recursive expansion
# it performs is identical to what the other two clauses produced together,
# so behavior is unchanged.)
def save_word_order({prev_1, prev_2}, {_next_1, _next_2} = next) do
  save_word_order(prev_1, next)
  save_word_order(prev_2, next)
end

def save_word_order({prev_1, prev_2}, next) do
  save_word_order(prev_1, next)
  save_word_order(prev_2, next)
end

def save_word_order(prev, {next_1, next_2}) do
  save_word_order(prev, next_1)
  save_word_order(prev, next_2)
end