couchemar/esperantisto - Change NQ22ZIMD7U26TTQZPSUQBOGJRU3FYOTARB5OBWQHJUV6GWAYXSSAC

ŝanĝi linifinojn al posix

Created by couchemar on December 30, 2021

NQ22ZIMD7U26TTQZPSUQBOGJRU3FYOTARB5OBWQHJUV6GWAYXSSAC

Dependencies

In channels

main

Change contents

Replacement in R/esperantisto.Rmd at line 1 [4.55]

B:BD[4.55] → [4.56:1278]

B:BD[4.1278] → [5.175:192]

B:BD[5.192] → [2.123:279]

∅:D[2.279] → [5.313:344]

B:BD[5.313] → [5.313:344]

∅:D[5.344] → [4.1426:1431]

B:BD[4.1426] → [4.1426:1431]

B:BD[4.1431] → [6.0:284]

∅:D[6.284] → [4.1431:1433]

B:BD[4.1431] → [4.1431:1433]

B:BD[4.1433] → [6.285:323]

∅:D[6.323] → [4.1433:1435]

B:BD[4.1433] → [4.1433:1435]

B:BD[4.1435] → [6.324:729]

B:BD[6.729] → [3.0:41]

∅:D[3.41] → [6.741:746]

B:BD[6.741] → [6.741:746]

B:BD[6.746] → [3.42:97]

∅:D[3.97] → [6.799:804]

B:BD[6.799] → [6.799:804]

B:BD[6.804] → [3.98:100]

∅:D[3.100] → [7.0:167]

B:BD[6.804] → [7.0:167]

∅:D[7.167] → [6.804:806]

B:BD[6.804] → [6.804:806]

B:BD[6.806] → [7.168:729]

B:BD[7.729] → [8.122:266]

∅:D[8.266] → [7.849:888]

B:BD[7.849] → [7.849:888]

---
title: "Esperantisto"
output: html_notebook
---
```{r}
library(ghql)
library(dplyr)
library(jsonlite)
# Client for the local GraphQL endpoint that serves the word data.
gql_conn <- GraphqlClient$new(url = "http://localhost:4000")

# Request every word together with its `fastrpEmbedding` field.
words_query <- "
query wordEmbeddings {
  words {
    word
    fastrpEmbedding
  }
}
"

# Register the query under the name "link", run it and parse the JSON reply.
rq <- Query$new()$query("link", words_query)
result <- gql_conn$exec(rq$link) %>%
  fromJSON(flatten = FALSE)
words <- result$data$words %>% as_tibble()

# XXX: We have duplicates! It seems we need to increase the number of
# dimensions when building the embeddings...
# Keep only the first word for each distinct embedding.
deduped_words <- words %>% distinct(fastrpEmbedding, .keep_all = TRUE)
```
```{r}
library(tidyr)
# Spread every embedding vector over one column per dimension. The column is
# renamed to `frp` first, and with an empty separator the new column names are
# `frp` followed directly by the inner name or position.
unnested_words <- deduped_words %>%
  rename(frp = fastrpEmbedding) %>%
  unnest_wider(frp, names_sep = "")
```
```{r}
library(Rtsne)
library(tidyverse)
# Number the rows so the t-SNE coordinates can be joined back to the words.
unnested_words_with_id <- mutate(unnested_words, ID = row_number())

# Seed the RNG before the t-SNE run.
set.seed(42)
tSNE_fit <- unnested_words_with_id %>%
  # Keep the numeric columns only.
  select(where(is.numeric)) %>%
  # Move ID into the row names so it is not treated as a feature column.
  column_to_rownames("ID") %>%
  # Centre and scale every column, then compute the embedding.
  scale() %>%
  Rtsne()
```
```{r}
# Turn the t-SNE coordinate matrix into a data frame, name its two columns,
# and attach the word data by matching on the running row ID.
tSNE_df <- as.data.frame(tSNE_fit$Y) %>%
  rename(tSNE1 = "V1", tSNE2 = "V2") %>%
  mutate(ID = row_number()) %>%
  inner_join(unnested_words_with_id, by = "ID") %>%
  # The ID was only needed for the join.
  select(-ID)
```
```{r}
library(plotly)
# Plot a random sample of words at their t-SNE coordinates.
set.seed(42)
g <- tSNE_df %>%
  # slice_sample() returns all rows when fewer than 2000 are available,
  # whereas sample_n(2000) would stop with an error.
  slice_sample(n = 2000) %>%
  ggplot(aes(x = tSNE1, y = tSNE2, label = word)) +
  geom_text(aes(label = word), hjust = 0, vjust = 0, color = "blue")

# Render the static plot as an interactive plotly widget.
ggplotly(g, tooltip = "word")
```
```{r}
library(lsa)
# Rank the words in `df` by cosine similarity to the embedding of `word`.
#
# Arguments:
#   df   - data frame with a `word` column and a `fastrpEmbedding` column
#          holding one embedding vector per row.
#   word - the query word; it must occur in `df$word`.
#
# Returns `df` without the `fastrpEmbedding` column and without the query word
# itself, with an added numeric `sim` column, sorted by decreasing similarity.
# The result is ungrouped. Stops with an error when `word` is not in `df`.
nearest <- function(df, word) {
  # Keep the query under another name: inside dplyr verbs a bare `word`
  # refers to the column of that name, not to this argument.
  query <- word
  hits <- which(df$word == query)
  if (length(hits) == 0) {
    stop("Word not found in `df`: ", query, call. = FALSE)
  }
  # Use the first match only, so a repeated word cannot produce a
  # concatenated, over-long embedding vector.
  query_embedding <- unlist(df[hits[1], ]$fastrpEmbedding)

  df %>%
    rowwise() %>%
    mutate(
      sim = as.numeric(
        cosine(unlist(fastrpEmbedding), .env$query_embedding)
      )
    ) %>%
    ungroup() %>%
    # Remove the query word explicitly instead of assuming it sorts first.
    filter(!(word %in% .env$query)) %>%
    arrange(desc(sim)) %>%
    select(-fastrpEmbedding)
}
# Example query: rank the other words by similarity to "mi".
deduped_words %>% nearest("mi")
```
```{r}
# Rank the words in `df` by cosine similarity to the vector
# embedding(a) - embedding(b) + embedding(c).
#
# Arguments:
#   df      - data frame with a `word` column and a `fastrpEmbedding` column
#             holding one embedding vector per row.
#   a, b, c - words whose embeddings are combined; each must occur in
#             `df$word`.
#
# Returns `df` without the `fastrpEmbedding` column and without the rows for
# `a`, `b` and `c`, with an added numeric `sim` column, sorted by decreasing
# similarity. The result is ungrouped. Stops with an error when one of the
# words is not in `df`.
analogue <- function(df, a, b, c) {
  # Look embeddings up in `df` itself; the function no longer depends on a
  # global `words` table being defined.
  embedding_of <- function(w) {
    hits <- which(df$word == w)
    if (length(hits) == 0) {
      stop("Word not found in `df`: ", w, call. = FALSE)
    }
    # First match only, so a repeated word cannot concatenate embeddings.
    unlist(df[hits[1], ]$fastrpEmbedding)
  }

  target <- embedding_of(a) - embedding_of(b) + embedding_of(c)
  inputs <- c(a, b, c)

  df %>%
    rowwise() %>%
    mutate(
      sim = as.numeric(cosine(unlist(fastrpEmbedding), .env$target))
    ) %>%
    ungroup() %>%
    arrange(desc(sim)) %>%
    select(-fastrpEmbedding) %>%
    # The input words themselves are not interesting answers.
    filter(!(word %in% .env$inputs))
}
# Example query: rank words against the vector "reĝo" - "viro" + "virino".
deduped_words %>% analogue("reĝo", "viro", "virino")
```
```{r}
library(Rdimtools)
library(tidytext)
library(scales)
# Build the numeric matrix from every embedding column. Selecting the `frp*`
# columns by name replaces the fixed 2:32 range, which only matched
# 31-dimensional embeddings.
# NOTE(review): if restricting the PCA to the first 31 dimensions was
# intentional, restore the explicit range.
words_emb_matrix <- unnested_words %>%
  select(starts_with("frp")) %>%
  as.matrix()

# Project onto 16 principal components and rescale the scores to [-1, 1].
pca_res <- do.pca(words_emb_matrix, ndim = 16)
res <- pca_res$Y %>%
  rescale(to = c(-1, 1)) %>%
  as_tibble()
res$word <- deduped_words$word

# For each component, plot the 12 words with the largest absolute score.
res %>%
  pivot_longer(
    !word,
    names_prefix = "V",
    names_to = "pca",
    names_transform = list(pca = as.integer)
  ) %>%
  group_by(pca) %>%
  top_n(12, abs(value)) %>%
  ungroup() %>%
  # Order the words within each facet by their value.
  mutate(word = reorder_within(word, value, pca)) %>%
  ggplot(aes(word, value, fill = pca)) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~pca, scales = "free_y", ncol = 4) +
  scale_x_reordered() +
  coord_flip() +
  labs(
    x = NULL,
    y = "Value"
  )
```
```{r}
# Plot a random sample of words on the first two principal components.
set.seed(42)
g2 <- res %>%
  # slice_sample() returns all rows when fewer than 200 are available,
  # whereas sample_n(200) would stop with an error.
  slice_sample(n = 200) %>%
  ggplot(aes(x = V1, y = V2, label = word)) +
  geom_text(aes(label = word), hjust = 0, vjust = 0, color = "blue")

# Render the static plot as an interactive plotly widget.
ggplotly(g2, tooltip = "word")
```

[4.55]

---
title: "Esperantisto"
output: html_notebook
---
```{r}
library(ghql)
library(dplyr)
library(jsonlite)
# Client for the local GraphQL endpoint that serves the word data.
gql_conn <- GraphqlClient$new(url = "http://localhost:4000")

# Request every word together with its `fastrpEmbedding` field.
words_query <- "
query wordEmbeddings {
  words {
    word
    fastrpEmbedding
  }
}
"

# Register the query under the name "link", run it and parse the JSON reply.
rq <- Query$new()$query("link", words_query)
result <- gql_conn$exec(rq$link) %>%
  fromJSON(flatten = FALSE)
words <- result$data$words %>% as_tibble()

# XXX: We have duplicates! It seems we need to increase the number of
# dimensions when building the embeddings...
# Keep only the first word for each distinct embedding.
deduped_words <- words %>% distinct(fastrpEmbedding, .keep_all = TRUE)
```
```{r}
library(tidyr)
# Spread every embedding vector over one column per dimension. The column is
# renamed to `frp` first, and with an empty separator the new column names are
# `frp` followed directly by the inner name or position.
unnested_words <- deduped_words %>%
  rename(frp = fastrpEmbedding) %>%
  unnest_wider(frp, names_sep = "")
```
```{r}
library(Rtsne)
library(tidyverse)
# Number the rows so the t-SNE coordinates can be joined back to the words.
unnested_words_with_id <- mutate(unnested_words, ID = row_number())

# Seed the RNG before the t-SNE run.
set.seed(42)
tSNE_fit <- unnested_words_with_id %>%
  # Keep the numeric columns only.
  select(where(is.numeric)) %>%
  # Move ID into the row names so it is not treated as a feature column.
  column_to_rownames("ID") %>%
  # Centre and scale every column, then compute the embedding.
  scale() %>%
  Rtsne()
```
```{r}
# Turn the t-SNE coordinate matrix into a data frame, name its two columns,
# and attach the word data by matching on the running row ID.
tSNE_df <- as.data.frame(tSNE_fit$Y) %>%
  rename(tSNE1 = "V1", tSNE2 = "V2") %>%
  mutate(ID = row_number()) %>%
  inner_join(unnested_words_with_id, by = "ID") %>%
  # The ID was only needed for the join.
  select(-ID)
```
```{r}
library(plotly)
# Plot a random sample of words at their t-SNE coordinates.
set.seed(42)
g <- tSNE_df %>%
  # slice_sample() returns all rows when fewer than 2000 are available,
  # whereas sample_n(2000) would stop with an error.
  slice_sample(n = 2000) %>%
  ggplot(aes(x = tSNE1, y = tSNE2, label = word)) +
  geom_text(aes(label = word), hjust = 0, vjust = 0, color = "blue")

# Render the static plot as an interactive plotly widget.
ggplotly(g, tooltip = "word")
```
```{r}
library(lsa)
# Rank the words in `df` by cosine similarity to the embedding of `word`.
#
# Arguments:
#   df   - data frame with a `word` column and a `fastrpEmbedding` column
#          holding one embedding vector per row.
#   word - the query word; it must occur in `df$word`.
#
# Returns `df` without the `fastrpEmbedding` column and without the query word
# itself, with an added numeric `sim` column, sorted by decreasing similarity.
# The result is ungrouped. Stops with an error when `word` is not in `df`.
nearest <- function(df, word) {
  # Keep the query under another name: inside dplyr verbs a bare `word`
  # refers to the column of that name, not to this argument.
  query <- word
  hits <- which(df$word == query)
  if (length(hits) == 0) {
    stop("Word not found in `df`: ", query, call. = FALSE)
  }
  # Use the first match only, so a repeated word cannot produce a
  # concatenated, over-long embedding vector.
  query_embedding <- unlist(df[hits[1], ]$fastrpEmbedding)

  df %>%
    rowwise() %>%
    mutate(
      sim = as.numeric(
        cosine(unlist(fastrpEmbedding), .env$query_embedding)
      )
    ) %>%
    ungroup() %>%
    # Remove the query word explicitly instead of assuming it sorts first.
    filter(!(word %in% .env$query)) %>%
    arrange(desc(sim)) %>%
    select(-fastrpEmbedding)
}
# Example query: rank the other words by similarity to "mi".
deduped_words %>% nearest("mi")
```
```{r}
# Rank the words in `df` by cosine similarity to the vector
# embedding(a) - embedding(b) + embedding(c).
#
# Arguments:
#   df      - data frame with a `word` column and a `fastrpEmbedding` column
#             holding one embedding vector per row.
#   a, b, c - words whose embeddings are combined; each must occur in
#             `df$word`.
#
# Returns `df` without the `fastrpEmbedding` column and without the rows for
# `a`, `b` and `c`, with an added numeric `sim` column, sorted by decreasing
# similarity. The result is ungrouped. Stops with an error when one of the
# words is not in `df`.
analogue <- function(df, a, b, c) {
  # Look embeddings up in `df` itself; the function no longer depends on a
  # global `words` table being defined.
  embedding_of <- function(w) {
    hits <- which(df$word == w)
    if (length(hits) == 0) {
      stop("Word not found in `df`: ", w, call. = FALSE)
    }
    # First match only, so a repeated word cannot concatenate embeddings.
    unlist(df[hits[1], ]$fastrpEmbedding)
  }

  target <- embedding_of(a) - embedding_of(b) + embedding_of(c)
  inputs <- c(a, b, c)

  df %>%
    rowwise() %>%
    mutate(
      sim = as.numeric(cosine(unlist(fastrpEmbedding), .env$target))
    ) %>%
    ungroup() %>%
    arrange(desc(sim)) %>%
    select(-fastrpEmbedding) %>%
    # The input words themselves are not interesting answers.
    filter(!(word %in% .env$inputs))
}
# Example query: rank words against the vector "reĝo" - "viro" + "virino".
deduped_words %>% analogue("reĝo", "viro", "virino")
```
```{r}
library(Rdimtools)
library(tidytext)
library(scales)
# Build the numeric matrix from every embedding column. Selecting the `frp*`
# columns by name replaces the fixed 2:32 range, which only matched
# 31-dimensional embeddings.
# NOTE(review): if restricting the PCA to the first 31 dimensions was
# intentional, restore the explicit range.
words_emb_matrix <- unnested_words %>%
  select(starts_with("frp")) %>%
  as.matrix()

# Project onto 16 principal components and rescale the scores to [-1, 1].
pca_res <- do.pca(words_emb_matrix, ndim = 16)
res <- pca_res$Y %>%
  rescale(to = c(-1, 1)) %>%
  as_tibble()
res$word <- deduped_words$word

# For each component, plot the 12 words with the largest absolute score.
res %>%
  pivot_longer(
    !word,
    names_prefix = "V",
    names_to = "pca",
    names_transform = list(pca = as.integer)
  ) %>%
  group_by(pca) %>%
  top_n(12, abs(value)) %>%
  ungroup() %>%
  # Order the words within each facet by their value.
  mutate(word = reorder_within(word, value, pca)) %>%
  ggplot(aes(word, value, fill = pca)) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~pca, scales = "free_y", ncol = 4) +
  scale_x_reordered() +
  coord_flip() +
  labs(
    x = NULL,
    y = "Value"
  )
```
```{r}
# Plot a random sample of words on the first two principal components.
set.seed(42)
g2 <- res %>%
  # slice_sample() returns all rows when fewer than 200 are available,
  # whereas sample_n(200) would stop with an error.
  slice_sample(n = 200) %>%
  ggplot(aes(x = V1, y = V2, label = word)) +
  geom_text(aes(label = word), hjust = 0, vjust = 0, color = "blue")

# Render the static plot as an interactive plotly widget.
ggplotly(g2, tooltip = "word")
```