如何确定 R 中大型 data.table 的两列之间最长的公共子字符串
我修改了这个问题的答案:Find length of overlap in strings,但是由于迭代速度较慢,因此在应用于大数据时存在问题。
如何改进下面的函数,使其能更快地找出两个字符串之间(出现在任意位置、不区分大小写的)最长公共子字符串?
#' Longest common substring of two strings.
#'
#' Finds the longest run of consecutive characters that occurs in both
#' inputs. Comparison is case-sensitive; lower-case the inputs first for a
#' case-insensitive match. When several substrings share the maximum length,
#' the one that starts earliest in the shorter input is returned (`str2` when
#' both inputs have the same length).
#'
#' @param str1,str2 Single character strings.
#' @param type `"nchar"` returns the length of the match; any other value
#'   (default `"lcs"`) returns the substring itself.
#' @return A length-one character vector, or a length-one integer when
#'   `type = "nchar"`. `""` (or `0L`) when the inputs share no character.
strlcs <- function(str1, str2, type = "lcs") {
  # Candidates are cut from the shorter string, so make str2 the shorter one.
  if (nchar(str1) < nchar(str2)) {
    swap <- str2
    str2 <- str1
    str1 <- swap
  }
  n_short <- nchar(str2)
  n_long <- nchar(str1)

  best <- ""
  # Try the widest window first: the first width with a hit is the answer.
  # Every width down to 1 is tried, including the very last candidate.
  for (width in rev(seq_len(n_short))) {
    short_starts <- seq_len(n_short - width + 1L)
    candidates <- substring(str2, short_starts, short_starts + width - 1L)
    long_starts <- seq_len(n_long - width + 1L)
    windows <- substring(str1, long_starts, long_starts + width - 1L)
    # candidates are ordered by start position, so hits[[1L]] is the leftmost.
    hits <- candidates[candidates %in% windows]
    if (length(hits) > 0L) {
      best <- hits[[1L]]
      break
    }
  }

  if (type == "nchar") nchar(best) else best
}
样本数据:
library(data.table)
# Eight name pairs that differ in letter case and in surrounding words.
sampdata <- data.frame(
  str1 = c(
    "Doug Olivas", "GRANT MANAGEMENT LLC", "LUNA VAN DERESH",
    "wendy t marzardo", "AMIN NYGUEN COMPANY LLC", "GERARDO CONTRaraS",
    "miguel martinez", "albert marks porter"
  ),
  str2 = c(
    "doug olivas", "miguel grant", "LUNA VAN DERESH MANAGEMENT LLC",
    "marzardo", "amin nyguen llc", "gerardo contraras",
    "miggy martinez", "albert"
  ),
  # Spell out FALSE: the shorthand F is an ordinary, reassignable variable.
  stringsAsFactors = FALSE
)
# Create sample big data by stacking 10,000 copies of the sample data
# (8 rows each, 80,000 rows in total).
samplist <- lapply(seq_len(10000L), FUN = function(x) sampdata)
bigsampdata <- rbindlist(samplist)
上述函数并未针对大数据进行优化。
如何让下面的操作耗时明显少于目前难以忍受的 20 多秒?
DESIRED FUNCTION APPLIED ON BIG DATA:
# Baseline timing: apply strlcs() row by row to the lower-cased columns.
# type = "lcs" stores the matched substring; type = "nchar" gives its length.
system.time(
  bigsampdata$desired_LCSnchar <- vapply(
    seq_len(nrow(bigsampdata)),
    FUN = function(x) {
      strlcs(
        tolower(bigsampdata$str1[x]),
        tolower(bigsampdata$str2[x]),
        type = "lcs"
      )
    },
    # vapply() pins the column type; sapply() would silently build a list
    # column if any call returned something other than a single string.
    FUN.VALUE = character(1)
  )
)
#> user system elapsed
#> 24.290 0.008 24.313
解决方法
我发现使用 qualV 包中的 LCS 函数可以得到更快的解决方案(注意:qualV::LCS 计算的是最长公共子序列,匹配到的字符不一定连续,因此结果可能与最长公共子字符串不同):
library(data.table)
library(qualV)
#' Longest common substring of two strings (faster variant).
#'
#' Returns the longest run of consecutive characters present in both inputs.
#' qualV::LCS() is deliberately not used: it computes the longest common
#' *subsequence*, whose characters need not be adjacent, so it can return
#' text that appears in neither input as a contiguous piece.
#'
#' Comparison is case-sensitive. On ties the match that starts earliest in the
#' shorter input wins (`str2` when both inputs have the same length).
#'
#' @param str1,str2 Single character strings.
#' @return A length-one character vector; `""` when nothing is shared.
strlcs_op <- function(str1, str2) {
  v1 <- unlist(strsplit(str1, ""))
  v2 <- unlist(strsplit(str2, ""))
  # Keep the shorter string in v2 so the rolling row below is as small as
  # possible and ties are resolved against the shorter input.
  if (length(v1) < length(v2)) {
    swap <- v1
    v1 <- v2
    v2 <- swap
  }
  n_short <- length(v2)
  if (n_short == 0L) {
    return("")
  }

  # run[j] is the length of the common run ending at the current character
  # of v1 and at v2[j]; only the previous row is needed to extend it.
  run <- integer(n_short)
  best_len <- 0L
  best_end <- 0L
  for (ch in v1) {
    run <- (c(0L, run[-n_short]) + 1L) * (v2 == ch)
    top <- max(run)
    if (top > best_len) {
      best_len <- top
      best_end <- which.max(run)
    } else if (top > 0L && top == best_len) {
      # Same length found again: prefer the one ending earlier in v2.
      best_end <- min(best_end, which.max(run))
    }
  }

  keep <- seq.int(best_end - best_len + 1L, length.out = best_len)
  paste(v2[keep], collapse = "")
}
# same as yours but with data.table syntax
system.time(
  bigsampdata[
    , desired_LCSnchar := mapply(strlcs, tolower(str1), tolower(str2))
  ]
)
#> user system elapsed
#> 41.64 0.04 42.20
# optimised function
# Both columns must be passed: strlcs_op() takes two strings, so supplying
# only tolower(str2) fails with a missing-argument error.
system.time(
  bigsampdata[
    , desired_LCSnchar := mapply(strlcs_op, tolower(str1), tolower(str2))
  ]
)
#> user system elapsed
#> 4.58 0.00 4.75
您还可以把 mapply 换成 parallel 包中的 mcmapply 进行并行化,从而进一步提高速度。
我使用 C++ 通过 Rcpp 实现了 Wikipedia 上"最长公共子串"问题的伪代码解法。
library(Rcpp)
# Compile a C++ longest-common-substring routine and expose it to R as
# largeset_common_substring(str1, str2).
#
# The C++ body fills an r x n integer table L, where r and n are the lengths
# of the two strings. L[i][j] is L[i-1][j-1] + 1 when S[i] == T[j] (1 on the
# first row or column) and 0 otherwise, i.e. the length of the matching run
# that ends at S[i] and T[j]. z tracks the largest value seen so far and ret
# the substring of S that produced it. Only a strictly larger value replaces
# ret, so the first longest match met in scan order is the one returned.
#
# Elements are compared with ==, so matching is case-sensitive: lower-case
# both inputs beforehand for a case-insensitive result.
# NOTE(review): std::string elements are presumably single bytes, so
# multi-byte characters may be compared and cut byte-wise -- confirm before
# using on non-ASCII data.
cppFunction('
String largeset_common_substring(String str1,String str2)
{
std::string S = str1;
std::string T = str2;
int r = S.length();
int n = T.length();
std::vector<std::vector<int> > L(r,std::vector<int>(n));
int z = 0;
std::string ret;
for (int i = 0; i < r; ++i)
{
for (int j = 0; j < n; ++j)
{
if (S[i] == T[j])
{
if (i == 0 || j == 0)
L[i][j] = 1;
else
L[i][j] = L[i - 1][j - 1] + 1;
if (L[i][j] > z)
{
z = L[i][j];
ret = S.substr(i - z + 1,z);
}
}
else
{
L[i][j] = 0;
}
}
}
return ret;
}
')
# Example: only the first argument is lower-cased here because the second
# literal is already lower case.
largeset_common_substring(tolower("GRANT MANAGEMENT LLC"),"miguel grant")
#> [1] "grant"
以下是在您的大型数据集上运行的耗时。
library(data.table)
# Eight name pairs that differ in letter case and in surrounding words.
sampdata <- data.frame(
  str1 = c(
    "Doug Olivas", "GRANT MANAGEMENT LLC", "LUNA VAN DERESH",
    "wendy t marzardo", "AMIN NYGUEN COMPANY LLC", "GERARDO CONTRARAS",
    "miguel martinez", "albert marks porter"
  ),
  str2 = c(
    "doug olivas", "miguel grant", "LUNA VAN DERESH MANAGEMENT LLC",
    "marzardo", "amin nyguen llc", "gerardo contraras",
    "miggy martinez", "albert"
  ),
  # Spell out FALSE: the shorthand F is an ordinary, reassignable variable.
  stringsAsFactors = FALSE
)
# Create sample big data by stacking 10,000 copies of the sample data
# (8 rows each, 80,000 rows in total).
samplist <- lapply(seq_len(10000L), FUN = function(x) sampdata)
bigsampdata <- rbindlist(samplist)
# Time the compiled routine over every row: both columns are lower-cased
# and walked in parallel, and the matches are stored as a character column.
system.time(
  bigsampdata[, desired_LCSnchar := purrr::map2_chr(
    .x = tolower(bigsampdata$str1),
    .y = tolower(bigsampdata$str2),
    .f = largeset_common_substring
  )]
)
#> user system elapsed
#> 0.78 0.07 1.28
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。