Title: Byte Pair Encoding Text Tokenization
Description: Unsupervised text tokenizer focused on computational efficiency. Wraps the 'YouTokenToMe' library <https://github.com/VKCOM/YouTokenToMe> which is an implementation of fast Byte Pair Encoding (BPE) <https://aclanthology.org/P16-1162/>.
Authors: Jan Wijffels [aut, cre, cph] (R wrapper), BNOSAC [cph] (R wrapper), VK.com [cph], Gregory Popovitch [ctb, cph] (Files at src/parallel_hashmap, Apache License, Version 2.0), The Abseil Authors [ctb, cph] (Files at src/parallel_hashmap, Apache License, Version 2.0), Ivan Belonogov [ctb, cph] (Files at src/youtokentome, MIT License)
Maintainer: Jan Wijffels <[email protected]>
License: MPL-2.0
Version: 0.1.3
Built: 2024-11-10 04:29:07 UTC
Source: https://github.com/bnosac/tokenizers.bpe
Dataset from 2017 with questions asked by members of the Belgian Federal Parliament.
The dataset was extracted from http://data.dekamer.be and contains questions asked by members of the Belgian Federal Parliament. The questions are translated into Dutch and French.
The dataset contains the following information:
doc_id: an identifier
text: the question itself
language: the language of the text
Source: http://data.dekamer.be; the data is provided by http://www.dekamer.be in the public domain (CC0).
data(belgium_parliament)
str(belgium_parliament)
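A quick look at the dataset itself can be useful before training a tokenizer. A minimal sketch, assuming only the columns documented above (doc_id, text, language):

data(belgium_parliament, package = "tokenizers.bpe")
## Number of questions per language
table(belgium_parliament$language)
## Rough idea of question length in characters
summary(nchar(belgium_parliament$text))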
Construct a Byte Pair Encoding model on text
bpe(
  x,
  coverage = 0.9999,
  vocab_size = 5000,
  threads = -1L,
  pad_id = 0L,
  unk_id = 1L,
  bos_id = 2L,
  eos_id = 3L,
  model_path = file.path(getwd(), "youtokentome.bpe")
)
x: path to the text file containing training data or a character vector of text with training data
coverage: fraction of characters covered by the model. Must be in the range [0, 1]. A good value to use is about 0.9999.
vocab_size: integer indicating the number of tokens in the final vocabulary
threads: integer with the number of CPU threads to use for model processing. If equal to -1, the minimum of the number of available threads and 8 will be used.
pad_id: integer, reserved id for padding
unk_id: integer, reserved id for unknown symbols
bos_id: integer, reserved id for the begin-of-sentence token
eos_id: integer, reserved id for the end-of-sentence token
model_path: path to the file on disk where the model will be stored. Defaults to 'youtokentome.bpe' in the current working directory; see the sketch below for writing to a temporary file instead.
an object of class youtokentome, which is described at bpe_load_model
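Because model_path defaults to a file in the current working directory, writing the model to a temporary file keeps that directory clean. A minimal sketch, assuming only the arguments documented above; the tempfile() location is purely illustrative:

library(tokenizers.bpe)
data(belgium_parliament, package = "tokenizers.bpe")
txt <- subset(belgium_parliament, language == "french")$text
## Train a small BPE model, storing it in a temporary file
model <- bpe(txt, coverage = 0.999, vocab_size = 1000, threads = 1,
             model_path = tempfile(fileext = ".bpe"))
model$vocab_size
head(model$vocabulary)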
data(belgium_parliament, package = "tokenizers.bpe")
x <- subset(belgium_parliament, language == "french")
model <- bpe(x$text, coverage = 0.999, vocab_size = 5000, threads = 1)
model
str(model$vocabulary)
text <- c("L'appartement est grand & vraiment bien situe en plein centre",
          "Proportion de femmes dans les situations de famille monoparentale.")
bpe_encode(model, x = text, type = "subwords")
bpe_encode(model, x = text, type = "ids")
encoded <- bpe_encode(model, x = text, type = "ids")
decoded <- bpe_decode(model, encoded)
decoded
## Remove the model file (Clean up for CRAN)
file.remove(model$model_path)
Decode a sequence of Byte Pair Encoding ids into text again
bpe_decode(model, x, ...)
model: an object of class youtokentome, as returned by bpe or loaded with bpe_load_model
x: an integer vector of BPE ids (see the round-trip sketch below)
...: further arguments passed on to youtokentome_encode_as_ids
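A round trip through bpe_encode and bpe_decode is sketched below, assuming bpe_encode with type = "ids" returns a list with one integer vector per input text; the manual lookup at the end relies only on the vocabulary data.frame (columns id and subword) documented at bpe_load_model:

library(tokenizers.bpe)
data(belgium_parliament, package = "tokenizers.bpe")
txt <- subset(belgium_parliament, language == "french")$text
model <- bpe(txt, coverage = 0.999, vocab_size = 5000, threads = 1,
             model_path = tempfile(fileext = ".bpe"))
encoded <- bpe_encode(model, x = "Proportion de femmes", type = "ids")
bpe_decode(model, encoded)
## Assumed: encoded[[1]] holds the ids of the first (and only) text;
## map them back to subwords by hand via the vocabulary
model$vocabulary$subword[match(encoded[[1]], model$vocabulary$id)]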
data(belgium_parliament, package = "tokenizers.bpe")
x <- subset(belgium_parliament, language == "french")
model <- bpe(x$text, coverage = 0.999, vocab_size = 5000, threads = 1)
model
str(model$vocabulary)
text <- c("L'appartement est grand & vraiment bien situe en plein centre",
          "Proportion de femmes dans les situations de famille monoparentale.")
bpe_encode(model, x = text, type = "subwords")
bpe_encode(model, x = text, type = "ids")
encoded <- bpe_encode(model, x = text, type = "ids")
decoded <- bpe_decode(model, encoded)
decoded
## Remove the model file (Clean up for CRAN)
file.remove(model$model_path)
Tokenise text alongside a Byte Pair Encoding model
bpe_encode(
  model,
  x,
  type = c("subwords", "ids"),
  bos = FALSE,
  eos = FALSE,
  reverse = FALSE
)
model: an object of class youtokentome, as returned by bpe or loaded with bpe_load_model
x: a character vector of text to tokenise
type: a character string, either 'subwords' or 'ids', to get the subwords or the corresponding ids of these subwords as defined in the vocabulary of the model. Defaults to 'subwords'.
bos: logical, if set to TRUE the 'beginning of sentence' token will be added
eos: logical, if set to TRUE the 'end of sentence' token will be added
reverse: logical, if set to TRUE the output sequence of tokens will be reversed (the bos, eos and reverse flags compose as in the sketch below)
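The effect of the bos, eos and reverse flags is easiest to see side by side. A minimal sketch, assuming a model trained as in the examples:

library(tokenizers.bpe)
data(belgium_parliament, package = "tokenizers.bpe")
txt <- subset(belgium_parliament, language == "french")$text
model <- bpe(txt, coverage = 0.999, vocab_size = 5000, threads = 1,
             model_path = tempfile(fileext = ".bpe"))
## Plain subwords
bpe_encode(model, x = "Proportion de femmes", type = "subwords")
## With begin/end-of-sentence markers added and the output reversed
bpe_encode(model, x = "Proportion de femmes", type = "subwords",
           bos = TRUE, eos = TRUE, reverse = TRUE)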
data(belgium_parliament, package = "tokenizers.bpe")
x <- subset(belgium_parliament, language == "french")
model <- bpe(x$text, coverage = 0.999, vocab_size = 5000, threads = 1)
model
str(model$vocabulary)
text <- c("L'appartement est grand & vraiment bien situe en plein centre",
          "Proportion de femmes dans les situations de famille monoparentale.")
bpe_encode(model, x = text, type = "subwords")
bpe_encode(model, x = text, type = "ids")
encoded <- bpe_encode(model, x = text, type = "ids")
decoded <- bpe_decode(model, encoded)
decoded
## Remove the model file (Clean up for CRAN)
file.remove(model$model_path)
Load a Byte Pair Encoding model trained with bpe
bpe_load_model(file, threads = -1L)
file: path to the model
threads: integer with the number of CPU threads to use for model processing. If equal to -1, the minimum of the number of available threads and 8 will be used.
an object of class youtokentome, which is a list with elements:
model: an Rcpp pointer to the model
model_path: the path to the model
threads: the threads argument
vocab_size: the size of the BPE vocabulary
vocabulary: the BPE vocabulary, which is a data.frame with columns id and subword
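These elements can be inspected directly on a loaded model. A minimal sketch using the model file shipped with the package, as in the examples below:

library(tokenizers.bpe)
path <- system.file(package = "tokenizers.bpe", "extdata", "youtokentome.bpe")
model <- bpe_load_model(path, threads = 1)
model$vocab_size
head(model$vocabulary)   ## data.frame with columns id and subword
model$model_path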
## Reload a model
path <- system.file(package = "tokenizers.bpe", "extdata", "youtokentome.bpe")
model <- bpe_load_model(path)

## Build a model and load it again
data(belgium_parliament, package = "tokenizers.bpe")
x <- subset(belgium_parliament, language == "french")
model <- bpe(x$text, coverage = 0.999, vocab_size = 5000, threads = 1)
model <- bpe_load_model(model$model_path, threads = 1)

## Remove the model file (Clean up for CRAN)
file.remove(model$model_path)