couchemar/esperantisto: R/esperantisto.Rmd

---
title: "Esperantisto"
output: html_notebook
---

```{r}
library(ghql)
library(dplyr)
library(jsonlite)

# Client for the GraphQL endpoint that serves the word graph.
gql_conn <- GraphqlClient$new(url = 'http://localhost:4000')

# Paged query: each word with its `community` and `fastrpEmbedding` fields.
words_query <- '
query wordEmbeddings($limit: Int, $offset: Int) {
  words(options: {limit: $limit, offset: $offset}) {
    word
    community
    fastrpEmbedding
  }
}
'

rq <- Query$new()$query('link', words_query)

end <- FALSE

limit <- 10000
offset <- 0

# Pages are collected in a list and bound once after the loop; calling
# rbind() on the accumulated table inside the loop re-copies every
# previously fetched row on each iteration.
pages <- list()

while (!end) {
  result <- gql_conn$exec(
    rq$link,
    variables = list(limit = limit, offset = offset)
  ) %>%
    fromJSON(flatten = FALSE)

  # A failed GraphQL request carries a top-level `errors` entry. Without this
  # check the missing `data` would look like an empty page and paging would
  # stop silently with incomplete (or no) data.
  if (!is.null(result$errors)) {
    stop(
      "GraphQL query failed: ",
      toJSON(result$errors, auto_unbox = TRUE),
      call. = FALSE
    )
  }

  w <- result$data$words %>% as_tibble()
  pages[[length(pages) + 1L]] <- w
  offset <- offset + limit
  # An empty page means the previous request returned the last words.
  end <- nrow(w) == 0
}

words <- bind_rows(pages)

# XXX: We have duplicates! It seems we need to increase the number of
# dimensions used when computing the embeddings...
deduped_words <- words %>% distinct(fastrpEmbedding, .keep_all = TRUE)
```
```{r}
# Number of words per community, largest first, restricted to communities
# with at most 10 words.
deduped_words %>%
  count(community, sort = TRUE) %>%
  filter(n <= 10)
```
```{r}
# Words that belong to communities containing no more than two words.
deduped_words %>%
  add_count(community, name = ".community_size") %>%
  filter(.community_size <= 2) %>%
  select(word)
```


```{r}
library(tidyr)
# Spread each embedding (list column) into one column per component; the
# resulting column names all start with "frp".
unnested_words <- deduped_words %>%
  rename(frp = fastrpEmbedding) %>%
  unnest_wider(frp, names_sep = "")
```

```{r}
library(Rtsne)
library(tidyverse)

# Give every row a sequential ID so the t-SNE coordinates can be joined back
# to the words afterwards.
unnested_words_with_id <- mutate(unnested_words, ID = row_number())

# Seed the RNG before Rtsne() so repeated runs use the same random stream.
set.seed(42)
# Keep the numeric columns except `community`, move ID into the row names,
# standardise the remaining columns and run t-SNE with default settings.
tSNE_fit <- unnested_words_with_id %>%
  select(where(is.numeric), -community) %>%
  column_to_rownames("ID") %>%
  scale() %>%
  Rtsne()
```

```{r}
# Turn the t-SNE coordinate matrix into a data frame and re-attach the word
# data via the row-order ID assigned before fitting.
tSNE_df <- as.data.frame(tSNE_fit$Y) %>%
  rename(tSNE1 = V1, tSNE2 = V2) %>%
  mutate(ID = row_number()) %>%
  inner_join(unnested_words_with_id, by = "ID") %>%
  select(-ID)
```

```{r}
library(plotly)
# Seed the RNG so the same 2000 words are sampled on every run.
set.seed(42)

# Interactive text scatter of a random sample of words in t-SNE space,
# coloured by community; the tooltip is restricted to the word.
g <- ggplot(
  sample_n(tSNE_df, 2000),
  aes(x = tSNE1, y = tSNE2, label = word, color = community)
) +
  geom_text(aes(label = word), hjust = 0, vjust = 0)
ggplotly(g, tooltip = "word")
```

```{r}
library(lsa)

#' Rank the words of `df` by cosine similarity to a query word.
#'
#' @param df Data frame with a `word` column and a list column
#'   `fastrpEmbedding` holding one numeric vector per row.
#' @param word A single query word; it must occur in `df$word`. If it occurs
#'   more than once, the embedding of its first occurrence is used.
#' @return `df` without the `fastrpEmbedding` column and with an added numeric
#'   `sim` column, sorted by decreasing similarity. Rows for the query word
#'   itself are removed.
nearest <- function(df, word) {
  hit <- match(word, df$word)
  if (length(word) != 1L || is.na(hit)) {
    stop(
      "word not found in `df`: ", paste(word, collapse = ", "),
      call. = FALSE
    )
  }

  query <- unlist(df$fastrpEmbedding[[hit]])
  query_sq_norm <- drop(crossprod(query))

  # crossprod() fails on vectors of different lengths instead of silently
  # recycling, so a malformed embedding surfaces as an error.
  sims <- vapply(
    df$fastrpEmbedding,
    function(emb) {
      emb <- unlist(emb)
      drop(crossprod(emb, query)) / sqrt(drop(crossprod(emb)) * query_sq_norm)
    },
    numeric(1),
    USE.NAMES = FALSE
  )

  res <- df[setdiff(names(df), "fastrpEmbedding")]
  res$sim <- sims
  res <- res[order(-res$sim), , drop = FALSE]

  # Remove the query by name rather than dropping the first row: with tied
  # similarities the query is not guaranteed to be ranked first.
  res[!(res$word %in% word), , drop = FALSE]
}

# Words most similar to "mi" among the de-duplicated words.
nearest(deduped_words, "mi")
```

```{r}
#' Rank words by similarity to the analogy vector `a - b + c`.
#'
#' @param df Data frame of candidate words with a `word` column and a list
#'   column `fastrpEmbedding` holding one numeric vector per row.
#' @param a,b,c Single words whose embeddings are combined as `a - b + c`.
#' @param lookup Data frame (same columns as `df`) in which the embeddings of
#'   `a`, `b` and `c` are looked up. Defaults to `df`, so the function no
#'   longer reads a global `words` table; pass a fuller table here when the
#'   query words may be absent from `df`.
#' @return `df` without the `fastrpEmbedding` column and with an added numeric
#'   `sim` column, sorted by decreasing cosine similarity to the analogy
#'   vector. Rows for `a`, `b` and `c` are removed.
analogue <- function(df, a, b, c, lookup = df) {
  # Embedding of the first occurrence of `w` in `lookup`; errors if absent.
  embedding_of <- function(w) {
    hit <- match(w, lookup$word)
    if (length(w) != 1L || is.na(hit)) {
      stop(
        "word not found in `lookup`: ", paste(w, collapse = ", "),
        call. = FALSE
      )
    }
    unlist(lookup$fastrpEmbedding[[hit]])
  }

  target <- embedding_of(a) - embedding_of(b) + embedding_of(c)
  target_sq_norm <- drop(crossprod(target))

  # crossprod() fails on vectors of different lengths instead of silently
  # recycling, so a malformed embedding surfaces as an error.
  sims <- vapply(
    df$fastrpEmbedding,
    function(emb) {
      emb <- unlist(emb)
      drop(crossprod(emb, target)) / sqrt(drop(crossprod(emb)) * target_sq_norm)
    },
    numeric(1),
    USE.NAMES = FALSE
  )

  res <- df[setdiff(names(df), "fastrpEmbedding")]
  res$sim <- sims
  res <- res[order(-res$sim), , drop = FALSE]

  # The parameter `c` shadows base::c only as a value; the call c(...) below
  # still resolves to the base function.
  res[!(res$word %in% c(a, b, c)), , drop = FALSE]
}

# Candidates for the analogy "reĝo" - "viro" + "virino" among the
# de-duplicated words.
analogue(deduped_words, "reĝo", "viro", "virino")
```

```{r}
library(Rdimtools)
library(tidytext)
library(scales)

# Embedding components (the columns prefixed "frp") as a plain matrix.
words_emb_matrix <- unnested_words %>%
  select(starts_with("frp")) %>%
  as.matrix()

# Project the embeddings onto 16 principal components.
pca_res <- do.pca(words_emb_matrix, ndim = 16)

# Rescale all scores jointly to [-1, 1]. The columns are named V1, V2, ...
# explicitly because converting an unnamed matrix with as_tibble() without
# `.name_repair` relies on a deprecated compatibility behaviour.
res <- pca_res$Y %>%
  rescale(to = c(-1, 1)) %>%
  as_tibble(.name_repair = function(nms) paste0("V", seq_along(nms)))
res$word <- deduped_words$word

# For each component, plot the 12 words with the largest absolute score.
res %>%
  pivot_longer(
    !word,
    names_prefix = "V",
    names_to = "pca",
    names_transform = list(pca = as.integer)
  ) %>%
  group_by(pca) %>%
  # slice_max() replaces the superseded top_n(); ties are kept as before.
  slice_max(abs(value), n = 12, with_ties = TRUE) %>%
  ungroup() %>%
  # reorder_within() orders the words separately inside every facet.
  mutate(word = reorder_within(word, value, pca)) %>%
  ggplot(aes(word, value, fill = pca)) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~pca, scales = "free_y", ncol = 4) +
  scale_x_reordered() +
  coord_flip() +
  labs(
    x = NULL,
    y = "Value"
  )
```
```{r}
# Seed the RNG so the same 200 words are sampled on every run.
set.seed(42)

# Interactive text scatter of a random sample of words on the first two
# principal components; the tooltip is restricted to the word.
g2 <- ggplot(sample_n(res, 200), aes(x = V1, y = V2, label = word)) +
  geom_text(aes(label = word), hjust = 0, vjust = 0, color = "blue")
ggplotly(g2, tooltip = "word")
```