Skip to content

Create keywords using RAKE, adapt token and head_token ids, and merge remaining fields.  #102

@ernestaigner

Description

@ernestaigner

Hi,

This function is an adaptation of the RAKE function. Unlike the original function, the output is an annotation table with corrected IDs and merged fields (in the case of the other fields).

best,
Ernest

## Create keywords with RAKE, substitute them into the annotation table,
## renumber token/head_token ids so multi-word keywords collapse into a
## single token, and merge the remaining per-token fields.
##
## Adapted from udpipe::keywords_rake(); unlike the original, the return
## value is the full annotation table rather than a keyword summary.
##
## Args:
##   at   : udpipe-style annotation table (a data.table) with columns
##          doc_id, paragraph_id, sentence_id, token_id, head_token_id,
##          token, lemma, upos, xpos, feats, dep_rel, misc (and optionally
##          deps). NOTE(review): the id columns are treated as numeric below
##          (`x + 10^...`); udpipe normally returns them as character —
##          confirm they are converted upstream.
##   UPOS : upos tags considered relevant for keyword building.
##          NOTE(review): the default c("NOUN", "NOUN") contains a duplicate
##          and is equivalent to "NOUN" alone; possibly c("NOUN", "ADJ")
##          (the udpipe default) was intended — confirm with the author.
##   LRF  : minimum keyword frequency required before a keyword replaces
##          the lemma.
##   LR   : minimum RAKE score required before a keyword replaces the lemma.
##
## Returns: a data.table of unique rows with merged fields and recoded ids.
## Side effect: `at` is modified by reference (data.table `:=` semantics);
## the caller's table is reordered and its columns rewritten.
fun.kw.short <- function(at,UPOS=c("NOUN", "NOUN"),LRF=50,LR=1.5) {
  # Calculate the Rake. Keep the full information of the annotation table. This is based on udpipe::keyword_rake()
  # Bug fix: setorder()/setnames() below need data.table ATTACHED; udpipe
  # only imports it. (The previously loaded stringr was never used.)
  library(data.table)
  library(udpipe)
  ##  create one common ID on the level of sentences.
  at[,group:=unique_identifier(at, fields = c("doc_id","paragraph_id","sentence_id"))]
  ## copy lemma to 'word' to merge lemma to keywords. 
  at[,word:=lemma]
  
  # Rake 
  ## code relevant terms
  at[,.relevant:=upos %in% UPOS]
  ## Consecutive relevant tokens within a sentence share one keyword_id.
  at[,keyword_id:=data.table::rleid(group,.relevant)]
  ## Calculate degree of each keyword (number of words minus 1).
  at[.relevant != FALSE,degree:=.N-1L,keyword_id]
  ## create keywords by concatenating the words of a keyword_id
  at[.relevant != FALSE,keyword:=paste(word, collapse = " "),keyword_id]
  ## Calculate degree of each word (sum of degrees of a word).
  at[.relevant != FALSE,word_degree:=sum(degree),word]
  ## Calculate frequency of a word
  at[.relevant != FALSE,word_freq:=.N,word]
  ## Calculate rake of a word (degree / frequency)
  at[.relevant != FALSE,word_rake:=word_degree/word_freq]
  # Keywords
  ## Number of times the keyword is used. 
  at[.relevant != FALSE, keyword_freq:=length(unique(keyword_id)), .(keyword, word)]
  ## Ngram length of the keyword (number of distinct words in it)
  at[.relevant != FALSE, keyword_ngram:=length(unique(word)), .(keyword)]
  ## Rake score of keywords (sum of rake of the words).
  at[.relevant != FALSE, keyword_rake:=sum(word_rake),keyword_id]
  
  ## Replace lemma by the keyword when it passes both thresholds, and
  ## record the ngram length of the (possibly multi-word) lemma.
  at[,lemma:=ifelse(keyword_rake>LR & keyword_freq >=LRF & !is.na(keyword),keyword,lemma)]
  at[,lemma_ng:=ifelse(keyword_rake>LR & keyword_freq >=LRF & !is.na(keyword),keyword_ngram,1)]
  
  # Remove the helper variables added above
  at[,(c("keyword","keyword_ngram","keyword_freq","keyword_rake",".relevant","word_freq","word_rake","word_degree","degree","keyword_id","group","word")):=NULL]
  
  # merge head_token_ids variables
  ## create new token_id: consecutive identical lemmas collapse to one token
  setorder(at,doc_id,paragraph_id,sentence_id,token_id)
  at[,token_id2:=data.table::rleid(lemma),.(doc_id,paragraph_id,sentence_id)]
  ## recode id cols to a fixed digit width so they can be concatenated safely
  id_cols <- c("doc_id","paragraph_id","sentence_id","token_id","head_token_id","token_id2")
  fun.ext.int <- function (x) {x+10^max(nchar(x))}
  at[,(id_cols):=lapply(.SD,fun.ext.int),.SDcols=id_cols]
  at[,(id_cols):=lapply(.SD,as.numeric),.SDcols=id_cols]
  ## create new id cols: concatenation uniquely identifies a token
  ## (id1 = old numbering, id2 = new numbering)
  at[,id1:=as.numeric(paste0(doc_id,paragraph_id,sentence_id,token_id))]
  at[,id2:=as.numeric(paste0(doc_id,paragraph_id,sentence_id,token_id2))]
  at[,head_id1:=as.numeric(paste0(doc_id,paragraph_id,sentence_id,head_token_id))]
  
  # create new head ids: map each old head id to the new numbering via
  # the unique id1 -> id2 lookup
  at[,head_id2:=at[,.(id1,id2)][,unique(.SD)][match(at$head_id1,id1),id2]]
  ## the root has no head inside the sentence; keep its original head id
  at[dep_rel=="root",head_id2:=head_id1]
  ## create new head_token_id
  ## NOTE(review): %% 10000 assumes the recoded token_id2 is exactly 4
  ## digits wide (i.e. at most 999 merged tokens per sentence) — confirm.
  at[,head_token_id2:=head_id2 %% 10000]
  ## a token that heads itself after merging gets NA
  at[,head_token_id2:=ifelse(head_token_id2==token_id2,NA,head_token_id2)]
  at[,c("token_id","head_id1","head_id2","id1","id2","head_token_id"):=NULL]
  setnames(at,gsub("id2","id",names(at)))
 
  # merge entries in upos, xpos, feats, dep_rel, token and misc. 
  cols <- c("doc_id","paragraph_id","sentence_id","token_id")
  cols1 <- c("upos","xpos","feats","dep_rel","token","misc")
  ## Robustness fix: 'deps' may already be absent; only drop it when present.
  ## (column not needed; if kept, as.character might be needed)
  if ("deps" %in% names(at)) at[,deps:=NULL]
  #at[,(cols1):=lapply(.SD,as.character),.SDcols=cols1]
  ## concatenate the distinct non-NA values of merged tokens with "_"
  my.fun <- function(x) paste0(x[!is.na(x)&!duplicated(x)],collapse="_")
  at[,(cols1):=lapply(.SD,my.fun),by=cols,.SDcols=cols1]
  
  # export: unique rows only (merged tokens are now duplicated rows)
  at[,unique(.SD)]  
  
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions