|> Stream.map(fn file -> File.stream!(Path.join(@priv_dir, file)) end)
|> Stream.map(fn stream -> extract_book(stream) end)
|> Stream.map(fn stream -> normalize_text(stream) end)
|> Stream.map(&collect_sentences/1)
|> Stream.map(&normalize_sentences/1)
|> Stream.map(&filter_sentences/1)
|> Stream.map(fn file -> process_file(Path.join(@priv_dir, file)) end)
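For orientation, a minimal sketch of how either pipeline might be driven; the `process/0` wrapper and the final `Stream.run/1` are assumptions for illustration, not part of the original module:

# Hypothetical driver. Everything in the chain is a lazy Stream, so no file
# is opened until the stream is forced at the end.
def process() do
  get_files()
  |> Stream.map(fn file -> process_file(Path.join(@priv_dir, file)) end)
  |> Stream.run()
end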
def process_file(file) do
  fp = File.open!(file)
  stream = IO.binstream(fp, :line) |> Stream.filter(fn l -> l !== "\n" end)
  t = get_title(stream)
  a = get_author(stream)
  # (rest of process_file/1 is elided in this excerpt)

def sentence_to_words(sentence) do
  # The apostrophe is kept because Esperanto marks elision with it (hund' = hundo).
  permitted_symbols =
    MapSet.new(~w(' a b c ĉ d e f g ĝ h ĥ i j ĵ k l m n o p r s ŝ t u ŭ v z))

  sentence
  |> String.downcase()
  |> String.codepoints()
  |> Enum.map(fn symb ->
    if MapSet.member?(permitted_symbols, symb) do
      symb
    else
      " "
    end
  end)
  |> List.to_string()
  |> String.split()
  |> Stream.filter(fn word -> String.length(word) > 1 end)
  |> Stream.map(fn word ->
    if String.ends_with?(word, "'") do
      # `aliases` is presumably bound earlier in the module (elided here).
      case aliases[word] do
        nil ->
          if String.starts_with?(word, "'") do
            String.trim(word, "'")
          else
            if String.length(word) == 2 do
              nil
            else
              # Elided final -o: pair the elided form with its full form.
              {word, String.replace(word, "'", "o")}
            end
          end

        als ->
          {word, als}
      end
    else
      word
    end
  end)
  |> Stream.filter(fn word -> word != nil end)
end

defp get_files() do
  File.ls!(@priv_dir)
end

defp extract_book(stream) do
  stream
  |> Stream.transform(&parse_title/1, fn line, parser ->
    if parser == nil do
      {:halt, nil}
    else
      case parser.(line) do
        {nil, new_parser} -> {[], new_parser}
        {value, new_parser} -> {[value], new_parser}
      end
    end
  end)
end

defp parse_title("Title: " <> title = _line) do
  {{:title, String.trim(title)}, &parse_author/1}
end

defp parse_title(_) do
  {nil, &parse_title/1}
end

defp parse_author("Author: " <> name = _line) do
  {{:author, String.trim(name)},
   skip_until(
     fn line -> String.starts_with?(line, "*** START OF") end,
     skip_lines(6, keep_until(fn line -> String.starts_with?(line, "End of") end))
   )}
end
defp skip_until(predicate, next) do
  fn l ->
    if predicate.(l) do
      {nil, next}
    else
      {nil, skip_until(predicate, next)}
    end
  end
end

# Skip exactly n lines, then hand over to the next parser.
defp skip_lines(0, next), do: next

defp skip_lines(n, next) do
  fn _ -> {nil, skip_lines(n - 1, next)} end
end
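To make the parser-combinator chain concrete, here is a rough illustration of extract_book/1 on a fabricated Gutenberg header (run from inside the module, since the helpers are private; the input lines and the printed result are assumptions):

lines =
  [
    "Title: La Hobito",
    "Author: J. R. R. Tolkien",
    "*** START OF THE PROJECT GUTENBERG EBOOK LA HOBITO ***"
  ] ++
    List.duplicate("(licence boilerplate)", 6) ++
    [
      "En truo en la tero loĝis hobito.",
      "End of the Project Gutenberg EBook of La Hobito"
    ]

extract_book(lines) |> Enum.to_list()
# => [title: "La Hobito", author: "J. R. R. Tolkien",
#     line: "En truo en la tero loĝis hobito."]

Each parser consumes one line and returns {emitted_value_or_nil, next_parser}; keep_until/1 returning {nil, nil} is what makes the Stream.transform in extract_book/1 halt.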
defp get_author(stream) do
  stream = stream |> Stream.drop_while(fn line -> !String.starts_with?(line, "Author:") end)
  [author] = Enum.take(stream, 1)
  "Author: " <> a = author
  a |> String.trim()
end
defp keep_until(predicate) do
  fn line ->
    if predicate.(line) do
      {nil, nil}
    else
      {{:line, line}, keep_until(predicate)}
    end
  end
end

defp normalize_text(stream) do
  stream
  |> Stream.filter(fn
    {:line, " " <> _} -> false
    _ -> true
  end)
  |> Stream.map(&strip/1)
  |> drop_empty()
  |> drop_absurdus()
  |> Stream.map(&remove_special/1)
  |> Stream.map(&strip_inside/1)
end

defp strip({:line, line}) do
  {:line, String.trim(line)}
end

defp strip(skipped), do: skipped

defp strip_inside({:line, line}) do
  {:line, String.replace(line, ~r/(\s+)/, " ")}
end

defp strip_inside(skipped), do: skipped

defp drop_empty(stream) do
  stream
  |> Stream.filter(fn
    {:line, ""} -> false
    _ -> true
  end)
end

defp get_text(stream) do
  stream
  |> Stream.drop_while(fn line -> !String.starts_with?(line, "*** START OF THE") end)
  |> Stream.drop(1)
  |> Stream.take_while(fn line -> !String.starts_with?(line, "*** END OF THE") end)
  |> Stream.map(fn word -> String.normalize(word, :nfd) end)
  |> Stream.map(fn word ->
    String.replace(word, ~r/[!#$%&()*+,.:;<=>?@\^_`{|}~"\[\]«»\&]/, "")
  end)
end
# is_absurdus/1 backs the drop_absurdus/1 step above (that wrapper is elided
# from this excerpt); it flags decorative separator lines.
defp is_absurdus("* * * * *"), do: true
defp is_absurdus(_), do: false

defp remove_special({:line, line}) do
  new_line =
    line
    |> String.replace("_", "")
    |> String.replace("\"", "")
    |> String.replace(~r/\[.+\]/, "")

  {:line, new_line}
end

defp remove_special(skipped), do: skipped

defp collect_sentences(stream) do
  stream
  |> Stream.transform([], fn
    {:line, line}, acc ->
      splitted = String.split(line, ~r/\.|\?|\!/)

      case splitted do
        [p] ->
          # No sentence terminator on this line: keep accumulating.
          {[], [p | acc]}

        _ ->
          punctuations =
            Regex.scan(~r/\.|\?|\!/, line)
            |> List.flatten()

          last = List.last(splitted)

          first_sentence =
            [List.first(splitted) <> List.first(punctuations) | acc]
            |> Stream.map(fn l -> String.trim(l) end)
            |> Enum.reverse()
            |> Enum.join(" ")

          other_sentences =
            splitted
            |> Stream.zip(punctuations)
            |> Stream.drop(1)
            |> Stream.drop(-1)
            |> Enum.map(fn {s, p} -> {:sentence, s <> p} end)

          new_acc =
            if last == "" do
              []
            else
              [last]
            end

          {[{:sentence, first_sentence}] ++ other_sentences, new_acc}
      end

    a, acc ->
      {[a], acc}
  end)
end

defp normalize_sentences(stream) do
  stream
  |> Stream.map(fn
    {:sentence, s} -> {:sentence, String.trim(s)}
    other -> other
  end)
end

defp filter_sentences(stream) do
  stream
  |> Stream.filter(fn
    {:sentence, s} ->
      Enum.all?(
        [&long_sentence?/1, &not_blacklisted?/1],
        fn chk -> chk.(s) end
      )

    _ ->
      true
  end)
end

defp long_sentence?(sentence) do
  String.length(sentence) > 2
end

defp not_blacklisted?("(Rim."), do: false
defp not_blacklisted?(_sentence), do: true

def is_esperanto?(word) do
  check_esperanto(word)
end

defp check_esperanto(word) do
  Enum.any?(
    [
      &is_alias?/1,
      &is_table_word?/1,
      &is_in_vocab?/1,
      &is_conjunction?/1,
      &is_pronoun?/1,
      &is_preposition?/1,
      &is_noun?/1,
      &is_adjective?/1,
      &is_adverb?/1,
      &is_verb?/1
    ],
    fn f -> f.(word) end
  )
end

# An alias produced by sentence_to_words/1 is an {elided, full} tuple.
defp is_alias?({_, _}), do: true
defp is_alias?(_), do: false

defp is_noun?(word) do
  String.ends_with?(word, ["o", "oj", "on", "ojn"])
end

defp is_adjective?(word) do
  String.ends_with?(word, ["a", "aj", "an", "ajn"])
end

defp is_adverb?(word) do
  String.ends_with?(word, ["e", "en"])
end

defp is_pronoun?(word) do
  base = ~w(mi ni vi li ŝi ĝi ili oni si)

  Enum.any?(base, fn b ->
    word == b || word == b <> "a" || word == b <> "n" || word == b <> "aj" ||
      word == b <> "an" || word == b <> "ajn"
  end)
end

defp is_preposition?(word) do
  preps =
    ~w(al anstataŭ antaŭ apud ĉe ĉirkaŭ da de ekster el en ĝis inter je
       kontraŭ krom malantaŭ maltrans po post preter sub super sur tra trans)

  Enum.any?(preps, fn b -> word == b end)
end

# The full correlative (table word) matrix: ki-/ti-/i-/ĉi-/neni- crossed with
# -o(n), -u(n/j/jn), -a(n/j/jn), -es, -e(n), -am, -al, -el, -om.
defp is_table_word?(word) do
  tws =
    ~w(kio tio io ĉio nenio kion tion ion ĉion nenion
       kiu tiu iu ĉiu neniu kiun tiun iun ĉiun neniun
       kiuj tiuj iuj ĉiuj neniuj kiujn tiujn iujn ĉiujn neniujn
       kia tia ia ĉia nenia kian tian ian ĉian nenian
       kiaj tiaj iaj ĉiaj neniaj kiajn tiajn iajn ĉiajn neniajn
       kies ties ies ĉies nenies kie tie ie ĉie nenie
       kien tien ien ĉien nenien kiam tiam iam ĉiam neniam
       kial tial ial ĉial nenial kiel tiel iel ĉiel neniel
       kiom tiom iom ĉiom neniom)

  Enum.any?(tws, fn tw -> word == tw end)
end

defp is_conjunction?(word) do
  Enum.any?(~w(kaj aŭ sed plus minus nek), fn conj -> conj == word end)
end

defp is_verb?(word) do
  String.ends_with?(word, ["i", "as", "is", "os", "us", "u"])
end

defp is_in_vocab?(word) do
  words =
    MapSet.new(
      ~w(ajn almenaŭ ankaŭ apenaŭ do eĉ ja jen kvazaŭ mem nur pli plej
         preskaŭ tamen tre tro jes ne tuj por ĉar morgaŭ hieraŭ)
    )

  MapSet.member?(words, word)
end
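A quick sketch of how the tokenizer and the classifier compose, assuming the aliases lookup comes back empty (shapes inferred from the code above, not actual captured output):

"La hund' bojas."
|> sentence_to_words()
|> Enum.map(fn w -> {w, is_esperanto?(w)} end)
# => [{"la", true}, {{"hund'", "hundo"}, true}, {"bojas", true}]

The elided form rides along as an {elided, full} tuple, which is exactly what is_alias?/1 pattern-matches on.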
defp save({:author, author}, acc) do
  DB.set_author(acc.book_id, author)
  acc
end

@desired_percentage 51

defp save({:sentence, sentence}, acc) do
  words = Esperantisto.Text.sentence_to_words(sentence)

  # Partition the words into Esperanto / non-Esperanto buckets.
  check =
    words
    |> Enum.reduce({[], []}, fn word, {esp, not_esp} ->
      case Esperantisto.Text.is_esperanto?(word) do
        true -> {[word | esp], not_esp}
        false -> {esp, [word | not_esp]}
      end
    end)

  case check do
    {[], []} ->
      Logger.debug("Empty sentence: #{inspect(sentence)}")
      acc

    {esp, not_esp} ->
      len_esp = length(esp)
      len_not_esp = length(not_esp)
      total = len_esp + len_not_esp
      percentage = len_esp * 100 / total

      if percentage >= @desired_percentage do
        sentence_id = DB.save_sentence(acc.book_id, sentence)

        # Save all words concurrently, then save the word-order edges between
        # consecutive words (Stream.scan pairs each word with its predecessor).
        words
        |> Task.async_stream(
          fn word ->
            DB.save_word(sentence_id, word)
            word
          end,
          timeout: 30_000,
          max_concurrency: 10
        )
        |> Stream.map(fn {:ok, v} -> v end)
        |> Stream.scan({nil, nil}, fn word, {_, prev} -> {prev, word} end)
        |> Task.async_stream(
          fn {prev, word} -> DB.save_word_order(prev, word) end,
          timeout: 30_000,
          max_concurrency: 10
        )
        |> Stream.run()

        acc
      else
        Logger.warn(
          "Too many non esperanto words: #{len_not_esp}/#{total} " <>
            "(#{inspect(not_esp)}) in sentence #{sentence}"
        )

        acc
      end
  end
end

# Stray fragment, apparently from an earlier per-word variant of save/2:
if prev_word do
  DB.save_word_order(prev_word, word)
end

%State{acc | prev_word: word}
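The stray fragment above appears to come from an earlier, sequential variant of save/2 that wrote each word as it arrived and threaded the previous word through a %State{prev_word: ...} accumulator; the batched Task.async_stream version replaced that per-word approach.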
def save_sentence(book_id, sentence) do
  result =
    Bolt.Sips.query!(
      Bolt.Sips.conn(),
      """
      MATCH (book)
      WHERE id(book) = $id
      WITH book
      MERGE (book) -[:HAS_SENTENCE]-> (sentence:Sentence {text: $sentence})
      RETURN id(sentence) as sentence_id
      """,
      %{id: book_id, sentence: sentence},
      timeout: 25_000
    )

  [id] = result["sentence_id"]
  id
end
# Tail of the alias-saving query (the head of this function is elided in
# this excerpt); aliases are linked in both directions:
      MATCH (w1:Word {word: $word1})
      MATCH (w2:Word {word: $word2})
      MERGE (w1) -[:ALIAS_OF]-> (w2)
      MERGE (w1) <-[:ALIAS_OF]- (w2)
      """,
      %{word1: word, word2: als},
      timeout: 25_000
    )
end

def save_word(sentence_id, word) do
  Bolt.Sips.query!(
    Bolt.Sips.conn(),
    """
    MATCH (sentence)
    WHERE id(sentence) = $id
    WITH sentence
    // (the rest of this query and function is cut off in the excerpt)
# Clause order matters: the tuple/tuple clause must come first, or the
# tuple/any clause below it would always win and make it unreachable.
def save_word_order({prev_1, prev_2}, {_next_1, _next_2} = next) do
  save_word_order(prev_1, next)
  save_word_order(prev_2, next)
end

def save_word_order({prev_1, prev_2}, next) do
  save_word_order(prev_1, next)
  save_word_order(prev_2, next)
end

def save_word_order(prev, {next_1, next_2}) do
  save_word_order(prev, next_1)
  save_word_order(prev, next_2)
end
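To trace how the clauses cooperate, here is how an alias tuple fans out; the base clause for two plain words (the one that actually runs the Cypher MERGE) is not shown in this excerpt, so its existence is assumed:

DB.save_word_order({"hund'", "hundo"}, "bojas")
# expands to:
#   save_word_order("hund'", "bojas")   # plain/plain -> assumed base clause
#   save_word_order("hundo", "bojas")   # plain/plain -> assumed base clause

With a tuple on both sides, the first clause splits the previous word and the third clause then splits the next word, so all four plain combinations are persisted.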