3PGDQGVPBKZ3T4RJTKB7VZFF7UZFOL6CUFX6TCGOHLOG5UIQYS6AC
RU7OZCKOPESUNP4BO3TD4FRUABITBVUFZEZOGJWG243QYB2FZJMQC
7SSDATWD4YOCDC3UQKKJI5T7E6HM6E4DEA4BFUAM5LAIT5WHEWAQC
77ZDLLUTFQ3L3FELAWZQTOG5DUPINOXV5VYLEYBEONDOW2BSVZJAC
OA7ZVXKGIETV6MOKQGROPTQWZVZ3BUEXKFCYDU74UJS6FLH4QSBAC
DIJO2HGF3PO7BSYOUZN2DMHU2SRR6CLQ245XKXSETZPJHF6RRKDAC
WWZOH45TLD6X4YGTMXSAP4W7WBYBKVTNU3WYWMMCI6FJRVRUYMJAC
ICGDK5NMJH7PYZF3C6CAIXF6COSNX6ORYRGOPZNWJRRHWGZWMAHQC
|> Stream.map(fn word -> String.normalize(word, :nfd) end)
|> Stream.map(fn word ->
String.replace(word, ~r/[!#$%&()*+,.:;<=>?@\^_`{|}~"\[\]«»\&„“]/, "")
|> Stream.chunk_while(
[],
fn elem, acc ->
case elem do
"\n" ->
{:cont, Enum.reverse(acc), []}
_ ->
{:cont,
[
elem |> String.trim() |> String.split() |> Enum.join(" ")
| acc
]}
end
end,
fn
[] -> {:cont, []}
acc -> {:cont, Enum.reverse(acc), []}
end
)
|> Stream.filter(fn p -> p != [] end)
|> Stream.map(fn
[l] ->
l
p ->
Enum.join(p, " ")
|> Stream.map(fn word -> String.normalize(word, :nfc) end)
|> Stream.flat_map(&String.split/1)
|> Stream.map(&String.downcase/1)
end
# Tokenises `str` into lowercase words and then feeds the result through
# further pipeline stages.
#
# Visible steps, in order:
#   1. split `str` into codepoints and drop every codepoint that appears in
#      the punctuation list below;
#   2. join the remaining codepoints back into one string and downcase it;
#   3. split on whitespace, strip leading "-" from each token and drop
#      tokens that are empty afterwards;
#   4. pipe the token list into `Stream.scan/3`, `Task.async_stream/3` and
#      `Stream.run/1` (see the review note on those lines).
#
# NOTE(review): `book_id` is not a parameter of this function and is not
# bound anywhere in the visible body, and `%State{}`, `save/2` and
# `process_paragraph/2` are not defined in this block. The last three stages
# also name their element `p` (paragraph) although the preceding stages
# produce words. They may belong to a different pipeline — confirm against
# the full file before relying on this function's return value.
def words(str) do
str
|> String.codepoints()
# Keep only codepoints that are NOT in the punctuation list.
|> Stream.filter(fn s ->
s not in [
"!",
"#",
"$",
"%",
"&",
"(",
")",
"*",
"+",
",",
".",
":",
";",
"<",
"=",
">",
"?",
"@",
"^",
"_",
"`",
"{",
"|",
"}",
"~",
"\"",
"[",
"]",
"«",
"»",
# "&" is already listed above; the repeated entry does not change the result.
"&",
"„",
"“"
]
end)
# Reassemble the filtered codepoints into a single string.
|> Enum.join()
|> String.downcase()
# Split on whitespace into individual tokens.
|> String.split()
# "-" is not in the punctuation list, so leading dashes are stripped here;
# dashes inside or at the end of a token are kept.
|> Stream.map(fn w -> String.trim_leading(w, "-") end)
# Tokens that consisted only of dashes are now empty; drop them.
|> Enum.filter(fn w -> w != "" end)
# NOTE(review): `book_id` is unbound here and `save/2` / `%State{}` are not
# visible in this block — confirm this stage belongs to `words/1`.
|> Stream.scan(%State{book_id: book_id}, &save/2)
# Calls `process_paragraph/2` on each element via `Task.async_stream/3`, with
# the per-task timeout disabled (`:infinity`).
|> Task.async_stream(fn p -> process_paragraph(p, book_id) end,
timeout: :infinity
)
# Forces the lazy stream to run; results of the individual tasks are discarded.
|> Stream.run()
end
defp process_paragraph(paragraph, book_id) do
paragraph_id = DB.save_paragraph(book_id, paragraph)
paragraph
|> Esperantisto.Text.words()
|> Stream.scan(%State{paragraph_id: paragraph_id}, &save_word/2)
def save_paragraph(book_id, paragraph) do
result =
Bolt.Sips.query!(
Bolt.Sips.conn(),
"""
MATCH (book)
WHERE id(book) = $id
WITH book
MERGE (book) -[:HAS_PARAGRAPH]-> (p:Paragraph {text: $paragraph})
RETURN id(p) as paragraph_id
""",
%{id: book_id, paragraph: paragraph},
timeout: 25000
)