---
title: "Esperantisto"
output: html_notebook
---

```{r}
library(ghql)
library(dplyr)
library(jsonlite)

gql_conn <- GraphqlClient$new(url = 'http://localhost:4000')

words_query <- '
query wordEmbeddings($limit: Int, $offset: Int) {
  words(options: {limit: $limit, offset: $offset}) {
    word
    community
    fastrpEmbedding
  }
}
'

rq <- Query$new()$query('link', words_query)

# Page through the `words` query until the server returns an empty page.
# Pages are collected in a list and combined once at the end rather than
# growing a data frame with rbind() on every iteration.
limit <- 10000
offset <- 0
pages <- list()

repeat {
  result <- gql_conn$exec(
    rq$link,
    variables = list(limit = limit, offset = offset)
  ) %>%
    fromJSON(flatten = FALSE)

  # Without this check an error response (no `data$words`) would look like
  # an empty page and silently end the loop.
  if (!is.null(result$errors)) {
    stop(
      "GraphQL query failed: ",
      toJSON(result$errors, auto_unbox = TRUE),
      call. = FALSE
    )
  }

  page <- as_tibble(result$data$words)
  if (nrow(page) == 0) {
    break
  }

  pages[[length(pages) + 1]] <- page
  offset <- offset + limit
}

words <- bind_rows(pages)

# XXX: We have duplicates! It seems we need to increase the number of
# dimensions used during embedding...
deduped_words <- words %>%
  distinct(fastrpEmbedding, .keep_all = TRUE)
```

```{r}
# Communities with at most 10 words, largest first.
deduped_words %>%
  group_by(community) %>%
  tally(sort = TRUE) %>%
  filter(n <= 10)
```

```{r}
# Words that belong to communities of at most 2 words.
deduped_words %>%
  group_by(community) %>%
  filter(n() <= 2) %>%
  ungroup() %>%
  select(word)
```

```{r}
library(tidyr)

# Spread each embedding into one column per component (frp1, frp2, ...).
unnested_words <- deduped_words %>%
  rename(frp = fastrpEmbedding) %>%
  unnest_wider(frp, names_sep = '')
```

```{r}
library(Rtsne)
library(tidyverse)

# ID is the row number; it is used below to join the t-SNE coordinates
# back onto the words.
unnested_words_with_id <- unnested_words %>%
  mutate(ID = row_number())

set.seed(42)
tSNE_fit <- unnested_words_with_id %>%
  select(where(is.numeric), -community) %>%
  column_to_rownames("ID") %>%
  scale() %>%
  Rtsne()
```

```{r}
tSNE_df <- tSNE_fit$Y %>%
  as.data.frame() %>%
  rename(tSNE1 = "V1", tSNE2 = "V2") %>%
  mutate(ID = row_number()) %>%
  inner_join(unnested_words_with_id, by = "ID") %>%
  select(-ID)
```

```{r}
library(plotly)

set.seed(42)
# slice_sample() caps the sample at the number of available rows, so the
# plot still renders when fewer than 2000 words were fetched.
g <- tSNE_df %>%
  slice_sample(n = 2000) %>%
  ggplot(aes(x = tSNE1, y = tSNE2, label = word, color = community)) +
  geom_text(aes(label = word), hjust = 0, vjust = 0)

ggplotly(g, tooltip = 'word')
```

```{r}
library(lsa)

#' Look up the embedding vector stored for a word.
#'
#' @param df A data frame with a `word` column and a `fastrpEmbedding`
#'   list-column.
#' @param word A single string to look up in `df$word`.
#' @return The embedding of the first row whose `word` matches, unlisted.
#'   Stops with an error when the word is not present.
embedding_of <- function(df, word) {
  stopifnot(is.character(word), length(word) == 1L)
  # match() returns the first hit only, so a word that occurs on several
  # rows does not get its embeddings concatenated.
  idx <- match(word, df$word)
  if (is.na(idx)) {
    stop("Word not found: ", word, call. = FALSE)
  }
  unlist(df$fastrpEmbedding[[idx]])
}

#' Rank every row of `df` by cosine similarity to a target vector.
#'
#' @param df A data frame with a `fastrpEmbedding` list-column.
#' @param target A numeric vector to compare each embedding against.
#' @return `df` without `fastrpEmbedding`, with a numeric `sim` column
#'   added, sorted by `sim` in descending order.
rank_by_similarity <- function(df, target) {
  # cosine() on two vectors yields a 1x1 matrix; as.numeric() reduces it
  # to a scalar so `sim` is an ordinary numeric column.
  sims <- vapply(
    df$fastrpEmbedding,
    function(emb) as.numeric(cosine(unlist(emb), target)),
    numeric(1)
  )

  df %>%
    mutate(sim = sims) %>%
    arrange(desc(sim)) %>%
    select(-fastrpEmbedding)
}

#' Words in `df` ordered by similarity to `word`.
#'
#' @param df A data frame with `word` and `fastrpEmbedding` columns.
#' @param word The query word; it must be present in `df$word`.
#' @return The ranked rows of `df` (see `rank_by_similarity()`), with the
#'   query word itself removed.
nearest <- function(df, word) {
  query <- word
  df %>%
    rank_by_similarity(embedding_of(df, query)) %>%
    filter(!(.data$word %in% query))
}

deduped_words %>%
  nearest("mi")
```

```{r}
#' Rank words by similarity to the vector `a - b + c`.
#'
#' @param df A data frame with `word` and `fastrpEmbedding` columns; these
#'   are the candidate rows that get ranked.
#' @param a,b,c Single strings naming the words whose embeddings are
#'   combined as `a - b + c`.
#' @param vocab The data frame in which `a`, `b` and `c` are looked up.
#'   Defaults to `df`.
#' @return The ranked rows of `df` (see `rank_by_similarity()`), with the
#'   three input words removed.
analogue <- function(df, a, b, c, vocab = df) {
  target <- embedding_of(vocab, a) -
    embedding_of(vocab, b) +
    embedding_of(vocab, c)
  excluded <- c(a, b, c)

  df %>%
    rank_by_similarity(target) %>%
    filter(!(.data$word %in% excluded))
}

# The input words are looked up in the full `words` table, because
# de-duplicating by embedding may have dropped them from `deduped_words`.
deduped_words %>%
  analogue("reĝo", "viro", "virino", vocab = words)
```

```{r}
library(Rdimtools)
library(tidytext)
library(scales)

words_emb_matrix <- unnested_words %>%
  select(starts_with('frp')) %>%
  as.matrix()

pca_res <- do.pca(words_emb_matrix, ndim = 16)

# Rescale all components jointly to [-1, 1] and name the columns V1..Vn
# explicitly; the pivot and the scatter plot below rely on those names.
pca_scores <- rescale(pca_res$Y, to = c(-1, 1))
colnames(pca_scores) <- paste0("V", seq_len(ncol(pca_scores)))

res <- as_tibble(pca_scores)
res$word <- deduped_words$word

# For each component, show the 12 words with the largest absolute value.
res %>%
  pivot_longer(
    !word,
    names_prefix = "V",
    names_to = "pca",
    names_transform = list(pca = as.integer)
  ) %>%
  group_by(pca) %>%
  slice_max(abs(value), n = 12) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, value, pca)) %>%
  ggplot(aes(word, value, fill = pca)) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~pca, scales = "free_y", ncol = 4) +
  scale_x_reordered() +
  coord_flip() +
  labs(
    x = NULL,
    y = "Value"
  )
```

```{r}
set.seed(42)
g2 <- res %>%
  slice_sample(n = 200) %>%
  ggplot(aes(x = V1, y = V2, label = word)) +
  geom_text(aes(label = word), hjust = 0, vjust = 0, color = "blue")

ggplotly(g2, tooltip = 'word')
```