|> Stream.map(fn file -> File.stream!(Path.join(@priv_dir, file)) end)
|> Stream.map(fn stream -> extract_book(stream) end)
|> Stream.map(fn stream -> normalize_text(stream) end)
|> Stream.map(&collect_sentences/1)
|> Stream.map(&normalize_sentences/1)
|> Stream.map(&filter_sentences/1)
|> Stream.map(fn file -> process_file(Path.join(@priv_dir, file)) end)
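For orientation, a minimal sketch of how either pipeline might be driven; the `process/0` wrapper and the final `Stream.run/1` are assumptions for illustration, not part of the original module:

# Hypothetical driver. Everything in the chain is a lazy Stream, so no file
# is opened until the stream is forced at the end.
def process() do
  get_files()
  |> Stream.map(fn file -> process_file(Path.join(@priv_dir, file)) end)
  |> Stream.run()
end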
def process_file(file) do
  fp = File.open!(file)
  stream = IO.binstream(fp, :line) |> Stream.filter(fn l -> l !== "\n" end)
  t = get_title(stream)
  a = get_author(stream)
  # (rest of process_file/1 is elided in this excerpt)

def sentence_to_words(sentence) do
  # The apostrophe is kept because Esperanto marks elision with it (hund' = hundo).
  permitted_symbols =
    MapSet.new(~w(' a b c ĉ d e f g ĝ h ĥ i j ĵ k l m n o p r s ŝ t u ŭ v z))

  sentence
  |> String.downcase()
  |> String.codepoints()
  |> Enum.map(fn symb ->
    if MapSet.member?(permitted_symbols, symb) do
      symb
    else
      " "
    end
  end)
  |> List.to_string()
  |> String.split()
  |> Stream.filter(fn word -> String.length(word) > 1 end)
  |> Stream.map(fn word ->
    if String.ends_with?(word, "'") do
      # `aliases` is presumably bound earlier in the module (elided here).
      case aliases[word] do
        nil ->
          if String.starts_with?(word, "'") do
            String.trim(word, "'")
          else
            if String.length(word) == 2 do
              nil
            else
              # Elided final -o: pair the elided form with its full form.
              {word, String.replace(word, "'", "o")}
            end
          end

        als ->
          {word, als}
      end
    else
      word
    end
  end)
  |> Stream.filter(fn word -> word != nil end)
end

defp get_files() do
  File.ls!(@priv_dir)
end

defp extract_book(stream) do
  stream
  |> Stream.transform(&parse_title/1, fn line, parser ->
    if parser == nil do
      {:halt, nil}
    else
      case parser.(line) do
        {nil, new_parser} -> {[], new_parser}
        {value, new_parser} -> {[value], new_parser}
      end
    end
  end)
end

defp parse_title("Title: " <> title = _line) do
  {{:title, String.trim(title)}, &parse_author/1}
end

defp parse_title(_) do
  {nil, &parse_title/1}
end

defp parse_author("Author: " <> name = _line) do
  {{:author, String.trim(name)},
   skip_until(
     fn line -> String.starts_with?(line, "*** START OF") end,
     skip_lines(6, keep_until(fn line -> String.starts_with?(line, "End of") end))
   )}
end
defp skip_until(predicate, next) do
  fn l ->
    if predicate.(l) do
      {nil, next}
    else
      {nil, skip_until(predicate, next)}
    end
  end
end

# Skip exactly n lines, then hand over to the next parser.
defp skip_lines(0, next), do: next

defp skip_lines(n, next) do
  fn _ -> {nil, skip_lines(n - 1, next)} end
end
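To make the parser-combinator chain concrete, here is a rough illustration of extract_book/1 on a fabricated Gutenberg header (run from inside the module, since the helpers are private; the input lines and the printed result are assumptions):

lines =
  [
    "Title: La Hobito",
    "Author: J. R. R. Tolkien",
    "*** START OF THE PROJECT GUTENBERG EBOOK LA HOBITO ***"
  ] ++
    List.duplicate("(licence boilerplate)", 6) ++
    [
      "En truo en la tero loĝis hobito.",
      "End of the Project Gutenberg EBook of La Hobito"
    ]

extract_book(lines) |> Enum.to_list()
# => [title: "La Hobito", author: "J. R. R. Tolkien",
#     line: "En truo en la tero loĝis hobito."]

Each parser consumes one line and returns {emitted_value_or_nil, next_parser}; keep_until/1 returning {nil, nil} is what makes the Stream.transform in extract_book/1 halt.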
defp get_author(stream) do
  stream = stream |> Stream.drop_while(fn line -> !String.starts_with?(line, "Author:") end)
  [author] = Enum.take(stream, 1)
  "Author: " <> a = author
  a |> String.trim()
end
defp keep_until(predicate) do
  fn line ->
    if predicate.(line) do
      {nil, nil}
    else
      {{:line, line}, keep_until(predicate)}
    end
  end
end

defp normalize_text(stream) do
  stream
  |> Stream.filter(fn
    {:line, " " <> _} -> false
    _ -> true
  end)
  |> Stream.map(&strip/1)
  |> drop_empty()
  |> drop_absurdus()
  |> Stream.map(&remove_special/1)
  |> Stream.map(&strip_inside/1)
end

defp strip({:line, line}) do
  {:line, String.trim(line)}
end

defp strip(skipped), do: skipped

defp strip_inside({:line, line}) do
  {:line, String.replace(line, ~r/(\s+)/, " ")}
end

defp strip_inside(skipped), do: skipped

defp drop_empty(stream) do
  stream
  |> Stream.filter(fn
    {:line, ""} -> false
    _ -> true
  end)
end

defp get_text(stream) do
  stream
  |> Stream.drop_while(fn line -> !String.starts_with?(line, "*** START OF THE") end)
  |> Stream.drop(1)
  |> Stream.take_while(fn line -> !String.starts_with?(line, "*** END OF THE") end)
  |> Stream.map(fn word -> String.normalize(word, :nfd) end)
  |> Stream.map(fn word ->
    String.replace(word, ~r/[!#$%&()*+,.:;<=>?@\^_`{|}~"\[\]«»\&]/, "")
  end)
end
# is_absurdus/1 backs the drop_absurdus/1 step above (that wrapper is elided
# from this excerpt); it flags decorative separator lines.
defp is_absurdus("* * * * *"), do: true
defp is_absurdus(_), do: false

defp remove_special({:line, line}) do
  new_line =
    line
    |> String.replace("_", "")
    |> String.replace("\"", "")
    |> String.replace(~r/\[.+\]/, "")

  {:line, new_line}
end

defp remove_special(skipped), do: skipped

defp collect_sentences(stream) do
  stream
  |> Stream.transform([], fn
    {:line, line}, acc ->
      splitted = String.split(line, ~r/\.|\?|\!/)

      case splitted do
        [p] ->
          # No sentence terminator on this line: keep accumulating.
          {[], [p | acc]}

        _ ->
          punctuations =
            Regex.scan(~r/\.|\?|\!/, line)
            |> List.flatten()

          last = List.last(splitted)

          first_sentence =
            [List.first(splitted) <> List.first(punctuations) | acc]
            |> Stream.map(fn l -> String.trim(l) end)
            |> Enum.reverse()
            |> Enum.join(" ")

          other_sentences =
            splitted
            |> Stream.zip(punctuations)
            |> Stream.drop(1)
            |> Stream.drop(-1)
            |> Enum.map(fn {s, p} -> {:sentence, s <> p} end)

          new_acc =
            if last == "" do
              []
            else
              [last]
            end

          {[{:sentence, first_sentence}] ++ other_sentences, new_acc}
      end

    a, acc ->
      {[a], acc}
  end)
end

defp normalize_sentences(stream) do
  stream
  |> Stream.map(fn
    {:sentence, s} -> {:sentence, String.trim(s)}
    other -> other
  end)
end

defp filter_sentences(stream) do
  stream
  |> Stream.filter(fn
    {:sentence, s} ->
      Enum.all?(
        [&long_sentence?/1, &not_blacklisted?/1],
        fn chk -> chk.(s) end
      )

    _ ->
      true
  end)
end

defp long_sentence?(sentence) do
  String.length(sentence) > 2
end

defp not_blacklisted?("(Rim."), do: false
defp not_blacklisted?(_sentence), do: true

def is_esperanto?(word) do
  check_esperanto(word)
end

defp check_esperanto(word) do
  Enum.any?(
    [
      &is_alias?/1,
      &is_table_word?/1,
      &is_in_vocab?/1,
      &is_conjunction?/1,
      &is_pronoun?/1,
      &is_preposition?/1,
      &is_noun?/1,
      &is_adjective?/1,
      &is_adverb?/1,
      &is_verb?/1
    ],
    fn f -> f.(word) end
  )
end

# An alias produced by sentence_to_words/1 is an {elided, full} tuple.
defp is_alias?({_, _}), do: true
defp is_alias?(_), do: false

defp is_noun?(word) do
  String.ends_with?(word, ["o", "oj", "on", "ojn"])
end

defp is_adjective?(word) do
  String.ends_with?(word, ["a", "aj", "an", "ajn"])
end

defp is_adverb?(word) do
  String.ends_with?(word, ["e", "en"])
end

defp is_pronoun?(word) do
  base = ~w(mi ni vi li ŝi ĝi ili oni si)

  Enum.any?(base, fn b ->
    word == b || word == b <> "a" || word == b <> "n" || word == b <> "aj" ||
      word == b <> "an" || word == b <> "ajn"
  end)
end

defp is_preposition?(word) do
  preps =
    ~w(al anstataŭ antaŭ apud ĉe ĉirkaŭ da de ekster el en ĝis inter je
       kontraŭ krom malantaŭ maltrans po post preter sub super sur tra trans)

  Enum.any?(preps, fn b -> word == b end)
end

# The full correlative (table word) matrix: ki-/ti-/i-/ĉi-/neni- crossed with
# -o(n), -u(n/j/jn), -a(n/j/jn), -es, -e(n), -am, -al, -el, -om.
defp is_table_word?(word) do
  tws =
    ~w(kio tio io ĉio nenio kion tion ion ĉion nenion
       kiu tiu iu ĉiu neniu kiun tiun iun ĉiun neniun
       kiuj tiuj iuj ĉiuj neniuj kiujn tiujn iujn ĉiujn neniujn
       kia tia ia ĉia nenia kian tian ian ĉian nenian
       kiaj tiaj iaj ĉiaj neniaj kiajn tiajn iajn ĉiajn neniajn
       kies ties ies ĉies nenies kie tie ie ĉie nenie
       kien tien ien ĉien nenien kiam tiam iam ĉiam neniam
       kial tial ial ĉial nenial kiel tiel iel ĉiel neniel
       kiom tiom iom ĉiom neniom)

  Enum.any?(tws, fn tw -> word == tw end)
end

defp is_conjunction?(word) do
  Enum.any?(~w(kaj aŭ sed plus minus nek), fn conj -> conj == word end)
end

defp is_verb?(word) do
  String.ends_with?(word, ["i", "as", "is", "os", "us", "u"])
end

defp is_in_vocab?(word) do
  words =
    MapSet.new(
      ~w(ajn almenaŭ ankaŭ apenaŭ do eĉ ja jen kvazaŭ mem nur pli plej
         preskaŭ tamen tre tro jes ne tuj por ĉar morgaŭ hieraŭ)
    )

  MapSet.member?(words, word)
end
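A quick sketch of how the tokenizer and the classifier compose, assuming the aliases lookup comes back empty (shapes inferred from the code above, not actual captured output):

"La hund' bojas."
|> sentence_to_words()
|> Enum.map(fn w -> {w, is_esperanto?(w)} end)
# => [{"la", true}, {{"hund'", "hundo"}, true}, {"bojas", true}]

The elided form rides along as an {elided, full} tuple, which is exactly what is_alias?/1 pattern-matches on.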
defp save({:author, author}, acc) do
  DB.set_author(acc.book_id, author)
  acc
end

@desired_percentage 51

defp save({:sentence, sentence}, acc) do
  words = Esperantisto.Text.sentence_to_words(sentence)

  # Partition the words into Esperanto / non-Esperanto buckets.
  check =
    words
    |> Enum.reduce({[], []}, fn word, {esp, not_esp} ->
      case Esperantisto.Text.is_esperanto?(word) do
        true -> {[word | esp], not_esp}
        false -> {esp, [word | not_esp]}
      end
    end)

  case check do
    {[], []} ->
      Logger.debug("Empty sentence: #{inspect(sentence)}")
      acc

    {esp, not_esp} ->
      len_esp = length(esp)
      len_not_esp = length(not_esp)
      total = len_esp + len_not_esp
      percentage = len_esp * 100 / total

      if percentage >= @desired_percentage do
        sentence_id = DB.save_sentence(acc.book_id, sentence)

        # Save all words concurrently, then save the word-order edges between
        # consecutive words (Stream.scan pairs each word with its predecessor).
        words
        |> Task.async_stream(
          fn word ->
            DB.save_word(sentence_id, word)
            word
          end,
          timeout: 30_000,
          max_concurrency: 10
        )
        |> Stream.map(fn {:ok, v} -> v end)
        |> Stream.scan({nil, nil}, fn word, {_, prev} -> {prev, word} end)
        |> Task.async_stream(
          fn {prev, word} -> DB.save_word_order(prev, word) end,
          timeout: 30_000,
          max_concurrency: 10
        )
        |> Stream.run()

        acc
      else
        Logger.warn(
          "Too many non esperanto words: #{len_not_esp}/#{total} " <>
            "(#{inspect(not_esp)}) in sentence #{sentence}"
        )

        acc
      end
  end
end

# Stray fragment, apparently from an earlier per-word variant of save/2:
if prev_word do
  DB.save_word_order(prev_word, word)
end

%State{acc | prev_word: word}
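The stray fragment above appears to come from an earlier, sequential variant of save/2 that wrote each word as it arrived and threaded the previous word through a %State{prev_word: ...} accumulator; the batched Task.async_stream version replaced that per-word approach.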
def save_sentence(book_id, sentence) do
  result =
    Bolt.Sips.query!(
      Bolt.Sips.conn(),
      """
      MATCH (book)
      WHERE id(book) = $id
      WITH book
      MERGE (book) -[:HAS_SENTENCE]-> (sentence:Sentence {text: $sentence})
      RETURN id(sentence) as sentence_id
      """,
      %{id: book_id, sentence: sentence},
      timeout: 25_000
    )

  [id] = result["sentence_id"]
  id
end
# Tail of the alias-saving query (the head of this function is elided in
# this excerpt); aliases are linked in both directions:
      MATCH (w1:Word {word: $word1})
      MATCH (w2:Word {word: $word2})
      MERGE (w1) -[:ALIAS_OF]-> (w2)
      MERGE (w1) <-[:ALIAS_OF]- (w2)
      """,
      %{word1: word, word2: als},
      timeout: 25_000
    )
end

def save_word(sentence_id, word) do
  Bolt.Sips.query!(
    Bolt.Sips.conn(),
    """
    MATCH (sentence)
    WHERE id(sentence) = $id
    WITH sentence
    // (the rest of this query and function is cut off in the excerpt)
# Clause order matters: the tuple/tuple clause must come first, or the
# tuple/any clause below it would always win and make it unreachable.
def save_word_order({prev_1, prev_2}, {_next_1, _next_2} = next) do
  save_word_order(prev_1, next)
  save_word_order(prev_2, next)
end

def save_word_order({prev_1, prev_2}, next) do
  save_word_order(prev_1, next)
  save_word_order(prev_2, next)
end

def save_word_order(prev, {next_1, next_2}) do
  save_word_order(prev, next_1)
  save_word_order(prev, next_2)
end
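To trace how the clauses cooperate, here is how an alias tuple fans out; the base clause for two plain words (the one that actually runs the Cypher MERGE) is not shown in this excerpt, so its existence is assumed:

DB.save_word_order({"hund'", "hundo"}, "bojas")
# expands to:
#   save_word_order("hund'", "bojas")   # plain/plain -> assumed base clause
#   save_word_order("hundo", "bojas")   # plain/plain -> assumed base clause

With a tuple on both sides, the first clause splits the previous word and the third clause then splits the next word, so all four plain combinations are persisted.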