B:BD[
4.1278] → [
5.175:192]
B:BD[
5.192] → [
2.123:279]
B:BD[
5.313] → [
5.313:344]
∅:D[
5.344] → [
4.1426:1431]
B:BD[
4.1426] → [
4.1426:1431]
∅:D[
6.284] → [
4.1431:1433]
B:BD[
4.1431] → [
4.1431:1433]
B:BD[
4.1433] → [
6.285:323]
∅:D[
6.323] → [
4.1433:1435]
B:BD[
4.1433] → [
4.1433:1435]
B:BD[
4.1435] → [
6.324:729]
B:BD[
6.741] → [
6.741:746]
B:BD[
6.799] → [
6.799:804]
B:BD[
6.804] → [
6.804:806]
B:BD[
6.806] → [
7.168:729]
B:BD[
7.729] → [
8.122:266]
B:BD[
7.849] → [
7.849:888]
---
title: "Esperantisto"
output: html_notebook
---
```{r}
library(ghql)
library(dplyr)
library(jsonlite)
# Fetch every word together with its `fastrpEmbedding` from the local
# GraphQL endpoint and parse the JSON response into a tibble.
gql_conn <- GraphqlClient$new(url = "http://localhost:4000")
words_query <- '
query wordEmbeddings {
words {
word
fastrpEmbedding
}
}
'
rq <- Query$new()$query("link", words_query)
# Spell out FALSE: the shorthand `F` is an ordinary variable that can be
# reassigned.
result <- gql_conn$exec(rq$link) %>%
  fromJSON(flatten = FALSE)
words <- as_tibble(result$data$words)
# XXX: We have duplicates! It seems we need to increase the number of
# dimensions during embedding...
# Keep a single row per distinct embedding vector.
deduped_words <- words %>%
  distinct(fastrpEmbedding, .keep_all = TRUE)
```
```{r}
library(tidyr)
# Spread each embedding vector into one column per component; the columns
# carry the `frp` prefix.
unnested_words <- unnest_wider(
  rename(deduped_words, frp = fastrpEmbedding),
  frp,
  names_sep = ""
)
```
```{r}
library(Rtsne)
library(tidyverse)
# Tag every row with a running ID so the t-SNE output can be joined back to
# its word later on.
unnested_words_with_id <- mutate(unnested_words, ID = row_number())

# Fixed seed so the t-SNE layout is reproducible between runs.
set.seed(42)
# Only the numeric columns are fed to t-SNE; ID becomes the row names so it
# is not treated as a feature, and the features are standardised first.
tSNE_fit <- Rtsne(
  unnested_words_with_id %>%
    select(where(is.numeric)) %>%
    column_to_rownames(var = "ID") %>%
    scale()
)
```
```{r}
# Turn the t-SNE coordinates into a data frame and re-attach the word data
# by matching row position (ID) against `unnested_words_with_id`.
tSNE_df <- as.data.frame(tSNE_fit$Y) %>%
  rename(tSNE1 = V1, tSNE2 = V2) %>%
  mutate(ID = row_number()) %>%
  inner_join(unnested_words_with_id, by = "ID") %>%
  select(-ID)
```
```{r}
library(plotly)
# Plot a reproducible random sample of 2000 words at their t-SNE
# coordinates, labelled with the word itself, as an interactive plotly figure.
set.seed(42)
g <- ggplot(
  sample_n(tSNE_df, 2000),
  aes(x = tSNE1, y = tSNE2, label = word)
) +
  geom_text(aes(label = word), hjust = 0, vjust = 0, color = "blue")
ggplotly(g, tooltip = "word")
```
```{r}
library(lsa)
#' Rank the words in `df` by cosine similarity to the embedding of `word`.
#'
#' @param df Data frame with a `word` column and a `fastrpEmbedding` column
#'   whose entries are `unlist()`-ed into numeric vectors.
#' @param word A single query word; it must occur in `df$word`.
#' @return `df` without the `fastrpEmbedding` column and without the query
#'   word, with an added `sim` column, sorted by decreasing similarity.
nearest <- function(df, word) {
  stopifnot(length(word) == 1L)

  # Use the first match only: if the word occurred more than once, taking all
  # matches would concatenate their embeddings into a wrong-length vector.
  query_row <- match(word, df$word)
  if (is.na(query_row)) {
    stop("Word not found in `df`: ", word, call. = FALSE)
  }
  wEmbedding <- unlist(df[query_row, ]$fastrpEmbedding)

  res <- df %>%
    rowwise() %>%
    # as.numeric() keeps `sim` a plain numeric column even if cosine()
    # returns a 1x1 matrix.
    mutate(sim = as.numeric(cosine(unlist(fastrpEmbedding), wEmbedding))) %>%
    ungroup() %>%
    arrange(desc(sim)) %>%
    select(-fastrpEmbedding)

  # Remove the query word by name instead of assuming it sorts first; another
  # word with a collinear embedding can tie with it at similarity 1.
  res[!(res$word %in% word), ]
}
deduped_words %>% nearest("mi")
```
```{r}
#' Solve the word analogy "b is to a as c is to ?" by vector arithmetic.
#'
#' Builds the target vector `emb(a) - emb(b) + emb(c)` and ranks the words in
#' `df` by cosine similarity to it.
#'
#' @param df Data frame of candidate words with a `word` column and a
#'   `fastrpEmbedding` column whose entries are `unlist()`-ed into numeric
#'   vectors.
#' @param a,b,c Single words whose embeddings are combined as `a - b + c`.
#' @param vocab Data frame (same columns as `df`) in which `a`, `b` and `c`
#'   are looked up. Defaults to `df`, so the function no longer depends on a
#'   global variable.
#' @return `df` without the `fastrpEmbedding` column and without the three
#'   input words, with an added `sim` column, sorted by decreasing similarity.
analogue <- function(df, a, b, c, vocab = df) {
  # Look up one word's embedding, failing loudly when the word is unknown.
  embedding_of <- function(w) {
    idx <- match(w, vocab$word)
    if (is.na(idx)) {
      stop("Word not found in `vocab`: ", w, call. = FALSE)
    }
    unlist(vocab[idx, ]$fastrpEmbedding)
  }
  embD <- embedding_of(a) - embedding_of(b) + embedding_of(c)

  res <- df %>%
    rowwise() %>%
    # as.numeric() keeps `sim` a plain numeric column even if cosine()
    # returns a 1x1 matrix.
    mutate(sim = as.numeric(cosine(unlist(fastrpEmbedding), embD))) %>%
    ungroup() %>%
    arrange(desc(sim)) %>%
    select(-fastrpEmbedding)

  # Base subsetting avoids data masking: inside filter(), columns named
  # `a`, `b` or `c` would shadow the function arguments.
  res[!(res$word %in% c(a, b, c)), ]
}
# The query words are looked up in the full `words` table, as before; the
# de-duplicated table may no longer contain all of them.
deduped_words %>% analogue("reĝo", "viro", "virino", vocab = words)
```
```{r}
library(Rdimtools)
library(tidytext)
library(scales)
# Reduce the word embeddings to 16 PCA dimensions and plot, for every
# dimension, the 12 words with the largest absolute value.
#
# Select the embedding columns by their `frp` prefix instead of a fixed
# positional range, so no component is dropped when the embedding size
# differs from what a hard-coded slice assumes.
words_emb_matrix <- unnested_words %>%
  dplyr::select(starts_with("frp")) %>%
  as.matrix()
pca_res <- do.pca(words_emb_matrix, ndim = 16)

# Rescale all values jointly to [-1, 1]. The columns are named V1, V2, ...
# explicitly because the code below strips the "V" prefix and the scatter
# plot refers to V1 and V2.
res <- rescale(pca_res$Y, to = c(-1, 1))
colnames(res) <- paste0("V", seq_len(ncol(res)))
res <- as_tibble(res)
# Rows are still in the order of `deduped_words`, so words attach by position.
res$word <- deduped_words$word

res %>%
  pivot_longer(
    !word,
    names_prefix = "V",
    names_to = "pca",
    names_transform = list(pca = as.integer)
  ) %>%
  group_by(pca) %>%
  top_n(12, abs(value)) %>%
  ungroup() %>%
  # Order the words within each facet independently.
  mutate(word = reorder_within(word, value, pca)) %>%
  ggplot(aes(word, value, fill = pca)) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~pca, scales = "free_y", ncol = 4) +
  scale_x_reordered() +
  coord_flip() +
  labs(x = NULL, y = "Value")
```
```{r}
# Interactive scatter of a reproducible random sample of 200 words on the
# first two PCA dimensions, labelled with the word itself.
set.seed(42)
g2 <- ggplot(
  sample_n(res, 200),
  aes(x = V1, y = V2, label = word)
) +
  geom_text(aes(label = word), hjust = 0, vjust = 0, color = "blue")
ggplotly(g2, tooltip = "word")
```
---
title: "Esperantisto"
output: html_notebook
---
```{r}
library(ghql)
library(dplyr)
library(jsonlite)
# Fetch every word together with its `fastrpEmbedding` from the local
# GraphQL endpoint and parse the JSON response into a tibble.
gql_conn <- GraphqlClient$new(url = "http://localhost:4000")
words_query <- '
query wordEmbeddings {
words {
word
fastrpEmbedding
}
}
'
rq <- Query$new()$query("link", words_query)
# Spell out FALSE: the shorthand `F` is an ordinary variable that can be
# reassigned.
result <- gql_conn$exec(rq$link) %>%
  fromJSON(flatten = FALSE)
words <- as_tibble(result$data$words)
# XXX: We have duplicates! It seems we need to increase the number of
# dimensions during embedding...
# Keep a single row per distinct embedding vector.
deduped_words <- words %>%
  distinct(fastrpEmbedding, .keep_all = TRUE)
```
```{r}
library(tidyr)
# Spread each embedding vector into one column per component; the columns
# carry the `frp` prefix.
unnested_words <- unnest_wider(
  rename(deduped_words, frp = fastrpEmbedding),
  frp,
  names_sep = ""
)
```
```{r}
library(Rtsne)
library(tidyverse)
# Tag every row with a running ID so the t-SNE output can be joined back to
# its word later on.
unnested_words_with_id <- mutate(unnested_words, ID = row_number())

# Fixed seed so the t-SNE layout is reproducible between runs.
set.seed(42)
# Only the numeric columns are fed to t-SNE; ID becomes the row names so it
# is not treated as a feature, and the features are standardised first.
tSNE_fit <- Rtsne(
  unnested_words_with_id %>%
    select(where(is.numeric)) %>%
    column_to_rownames(var = "ID") %>%
    scale()
)
```
```{r}
# Turn the t-SNE coordinates into a data frame and re-attach the word data
# by matching row position (ID) against `unnested_words_with_id`.
tSNE_df <- as.data.frame(tSNE_fit$Y) %>%
  rename(tSNE1 = V1, tSNE2 = V2) %>%
  mutate(ID = row_number()) %>%
  inner_join(unnested_words_with_id, by = "ID") %>%
  select(-ID)
```
```{r}
library(plotly)
# Plot a reproducible random sample of 2000 words at their t-SNE
# coordinates, labelled with the word itself, as an interactive plotly figure.
set.seed(42)
g <- ggplot(
  sample_n(tSNE_df, 2000),
  aes(x = tSNE1, y = tSNE2, label = word)
) +
  geom_text(aes(label = word), hjust = 0, vjust = 0, color = "blue")
ggplotly(g, tooltip = "word")
```
```{r}
library(lsa)
#' Rank the words in `df` by cosine similarity to the embedding of `word`.
#'
#' @param df Data frame with a `word` column and a `fastrpEmbedding` column
#'   whose entries are `unlist()`-ed into numeric vectors.
#' @param word A single query word; it must occur in `df$word`.
#' @return `df` without the `fastrpEmbedding` column and without the query
#'   word, with an added `sim` column, sorted by decreasing similarity.
nearest <- function(df, word) {
  stopifnot(length(word) == 1L)

  # Use the first match only: if the word occurred more than once, taking all
  # matches would concatenate their embeddings into a wrong-length vector.
  query_row <- match(word, df$word)
  if (is.na(query_row)) {
    stop("Word not found in `df`: ", word, call. = FALSE)
  }
  wEmbedding <- unlist(df[query_row, ]$fastrpEmbedding)

  res <- df %>%
    rowwise() %>%
    # as.numeric() keeps `sim` a plain numeric column even if cosine()
    # returns a 1x1 matrix.
    mutate(sim = as.numeric(cosine(unlist(fastrpEmbedding), wEmbedding))) %>%
    ungroup() %>%
    arrange(desc(sim)) %>%
    select(-fastrpEmbedding)

  # Remove the query word by name instead of assuming it sorts first; another
  # word with a collinear embedding can tie with it at similarity 1.
  res[!(res$word %in% word), ]
}
deduped_words %>% nearest("mi")
```
```{r}
#' Solve the word analogy "b is to a as c is to ?" by vector arithmetic.
#'
#' Builds the target vector `emb(a) - emb(b) + emb(c)` and ranks the words in
#' `df` by cosine similarity to it.
#'
#' @param df Data frame of candidate words with a `word` column and a
#'   `fastrpEmbedding` column whose entries are `unlist()`-ed into numeric
#'   vectors.
#' @param a,b,c Single words whose embeddings are combined as `a - b + c`.
#' @param vocab Data frame (same columns as `df`) in which `a`, `b` and `c`
#'   are looked up. Defaults to `df`, so the function no longer depends on a
#'   global variable.
#' @return `df` without the `fastrpEmbedding` column and without the three
#'   input words, with an added `sim` column, sorted by decreasing similarity.
analogue <- function(df, a, b, c, vocab = df) {
  # Look up one word's embedding, failing loudly when the word is unknown.
  embedding_of <- function(w) {
    idx <- match(w, vocab$word)
    if (is.na(idx)) {
      stop("Word not found in `vocab`: ", w, call. = FALSE)
    }
    unlist(vocab[idx, ]$fastrpEmbedding)
  }
  embD <- embedding_of(a) - embedding_of(b) + embedding_of(c)

  res <- df %>%
    rowwise() %>%
    # as.numeric() keeps `sim` a plain numeric column even if cosine()
    # returns a 1x1 matrix.
    mutate(sim = as.numeric(cosine(unlist(fastrpEmbedding), embD))) %>%
    ungroup() %>%
    arrange(desc(sim)) %>%
    select(-fastrpEmbedding)

  # Base subsetting avoids data masking: inside filter(), columns named
  # `a`, `b` or `c` would shadow the function arguments.
  res[!(res$word %in% c(a, b, c)), ]
}
# The query words are looked up in the full `words` table, as before; the
# de-duplicated table may no longer contain all of them.
deduped_words %>% analogue("reĝo", "viro", "virino", vocab = words)
```
```{r}
library(Rdimtools)
library(tidytext)
library(scales)
# Reduce the word embeddings to 16 PCA dimensions and plot, for every
# dimension, the 12 words with the largest absolute value.
#
# Select the embedding columns by their `frp` prefix instead of a fixed
# positional range, so no component is dropped when the embedding size
# differs from what a hard-coded slice assumes.
words_emb_matrix <- unnested_words %>%
  dplyr::select(starts_with("frp")) %>%
  as.matrix()
pca_res <- do.pca(words_emb_matrix, ndim = 16)

# Rescale all values jointly to [-1, 1]. The columns are named V1, V2, ...
# explicitly because the code below strips the "V" prefix and the scatter
# plot refers to V1 and V2.
res <- rescale(pca_res$Y, to = c(-1, 1))
colnames(res) <- paste0("V", seq_len(ncol(res)))
res <- as_tibble(res)
# Rows are still in the order of `deduped_words`, so words attach by position.
res$word <- deduped_words$word

res %>%
  pivot_longer(
    !word,
    names_prefix = "V",
    names_to = "pca",
    names_transform = list(pca = as.integer)
  ) %>%
  group_by(pca) %>%
  top_n(12, abs(value)) %>%
  ungroup() %>%
  # Order the words within each facet independently.
  mutate(word = reorder_within(word, value, pca)) %>%
  ggplot(aes(word, value, fill = pca)) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~pca, scales = "free_y", ncol = 4) +
  scale_x_reordered() +
  coord_flip() +
  labs(x = NULL, y = "Value")
```
```{r}
# Interactive scatter of a reproducible random sample of 200 words on the
# first two PCA dimensions, labelled with the word itself.
set.seed(42)
g2 <- ggplot(
  sample_n(res, 200),
  aes(x = V1, y = V2, label = word)
) +
  geom_text(aes(label = word), hjust = 0, vjust = 0, color = "blue")
ggplotly(g2, tooltip = "word")
```