Stack Overflow Asked on November 4, 2021
I have two data frames as follows:
# First table (one row per id). The key column is spelled "identifer" here.
df <- data.frame(
  id         = c("1-1", "2-2", "3-3", "4-4", "5-5", "6-6"),
  identifer  = c(1, 2, 3, 4, 5, 6),
  key        = c("A", "B", "C", "D", "E", "F"),
  product    = c(
    "productA", "productB", "productC",
    "productD", "productE", "productF"
  ),
  ingredient = c(
    "ingredientA", "ingredientB", "ingredientC",
    "ingredientD", "ingredientE", "ingredientF"
  ),
  DF         = c("Tablet", "Powder", "Suspension", "System", "Capsule", "Capsule")
)

# Second table: identifer 2 / key B occurs twice and there is no row for 5 / E.
df_2 <- data.frame(
  identifer  = c(1, 2, 2, 3, 4, 6),
  key        = c("A", "B", "B", "C", "D", "F"),
  product    = c(
    "productA", "productB", "productB",
    "productCC", "productDD", "productFF"
  ),
  ingredient = c(
    "ingredientA", "ingredientBB", "ingredientB",
    "ingredientC", "ingredientDD", "ingredeintFF"
  ),
  DF         = c("Tablet", "Powder", "Powder", "Suspension", "injection", "tablet"),
  Route      = c("ORAL", "INHALATION", "INHALATION", "topical", "injecatable", "oral")
)
I want to join these two datasets first on the following variables + create a new column called “match” that describes the join:
1) identifier,key, product, ingredient,DF
match="identifier,key, product, ingredient,DF"
Then, I want to join the REMAINING rows on these variables:
2)identifier, key, product, DF
match="identifier,key, product,DF"
Then I want to join the rows that are still unmatched after step 2 on the next set of variables, and so on:
3) identifier, key, ingredient, DF
4) identifier, key, DF
5) identifier, key, product, ingredient
6) identifier, key, product
7) identifier, key, ingredient
8) identifier, key
And I want to return the rows that do not have a match as well. I know how to do this stepwise but I’m wondering if there is an easier way to do this?
this is the expected output:
# Expected result: one row per row of df, the best partner from df_2 (NA when
# there is none) and a Match label naming the columns that agreed.
df_out <- data.frame(
  identifer    = c(1, 2, 3, 4, 5, 6),
  key          = c("A", "B", "C", "D", "E", "F"),
  product_1    = c(
    "productA", "productB", "productC",
    "productD", "productE", "productF"
  ),
  ingredient_1 = c(
    "ingredientA", "ingredientB", "ingredientC",
    "ingredientD", "ingredientE", "ingredientF"
  ),
  DF_1         = c("Tablet", "Powder", "Suspension", "System", "Capsule", "Capsule"),
  product_2    = c("productA", "productB", "productCC", "productDD", NA, "productFF"),
  ingredient_2 = c(
    "ingredientA", "ingredientB", "ingredientC",
    "ingredientDD", NA, "ingredeintFF"
  ),
  DF_2         = c("Tablet", "Powder", "Suspension", "injection", NA, "tablet"),
  Route_2      = c("ORAL", "INHALATION", "topical", "injecatable", NA, "oral"),
  Match        = c(
    "identifer+key+product+ingredient+DF",
    "identifier+key+product+ingredient+DF",
    "identifier+key+ingredient+DF",
    "identifer+key",
    "None",
    "identifer+key+product+ingredient"
  )
)
Here is an option using data.table:
library(data.table)
# Convert both data frames to data.tables in place (setDT works by reference),
# so the `:=` update joins further down can modify df directly.
setDT(df)
setDT(df_2)
# Optional join columns tried in addition to identifier + key, listed from the
# most specific combination to the least specific one. The final NULL entry
# stands for "identifier + key only".
keyord <- list(
  c("product", "ingredient", "DF"),
  c("product", "DF"),
  c("ingredient", "DF"),
  "DF",
  c("product", "ingredient"),
  "product",
  "ingredient",
  NULL
)

# Columns copied over from df_2 (they receive the "_2" suffix in df).
cols <- c("product", "ingredient", "DF", "Route")
# Label every row of df with the most specific key set that finds a partner in
# df_2 and copy that partner's columns into <col>_2.
#
# The key sets are walked from the LEAST to the MOST specific one and every
# update join writes the *_2 columns, Match and check together. A more specific
# join therefore overwrites what a looser join stored earlier, so the *_2
# values always come from the join that Match names. (Walking the list from
# most to least specific while only guarding Match lets the final
# identifier + key join replace the *_2 values of an exact match whenever df_2
# holds several rows for the same identifier/key.)
#
# Rows of df without any partner keep NA in Match, check and the *_2 columns.
df[, Match := NA_character_]
for (v in rev(keyord)) {
  k <- c("identifier", "key", v)
  df[
    df_2,
    on = k,
    c(paste0(cols, "_2"), "Match", "check") :=
      c(mget(paste0("i.", cols)), list(toString(k), TRUE))
  ]
}
# df has no Route column, hence skip_absent = TRUE.
setnames(df, cols, paste0(cols, "_1"), skip_absent = TRUE)
output:
id identifier key product_1 ingredient_1 DF_1 Match product_2 ingredient_2 DF_2 Route_2 check
1: 1-1 1 A productA ingredientA Tablet identifier, key, product, ingredient, DF productA ingredientA Tablet ORAL TRUE
2: 2-2 2 B productB ingredientB Powder identifier, key, product, ingredient, DF productB ingredientB Powder INHALATION TRUE
3: 3-3 3 C productC ingredientC Suspension identifier, key, ingredient, DF productCC ingredientC Suspension topical TRUE
4: 4-4 4 D productD ingredientD System identifier, key productDD ingredientDD injection injecatable TRUE
5: 5-5 5 E productE ingredientE Capsule <NA> <NA> <NA> <NA> <NA> NA
6: 6-6 6 F productF ingredientF Capsule identifier, key, product, ingredient productF ingredientF tablet oral TRUE
data after fixing some typos in OP:
# Question data with the key column renamed to "identifier".
df <- data.frame(
  id         = c("1-1", "2-2", "3-3", "4-4", "5-5", "6-6"),
  identifier = c(1, 2, 3, 4, 5, 6),
  key        = c("A", "B", "C", "D", "E", "F"),
  product    = c(
    "productA", "productB", "productC",
    "productD", "productE", "productF"
  ),
  ingredient = c(
    "ingredientA", "ingredientB", "ingredientC",
    "ingredientD", "ingredientE", "ingredientF"
  ),
  DF         = c("Tablet", "Powder", "Suspension", "System", "Capsule", "Capsule")
)

# Row 6 now carries "productF" / "ingredientF", so it agrees with df on
# product and ingredient.
df_2 <- data.frame(
  identifier = c(1, 2, 2, 3, 4, 6),
  key        = c("A", "B", "B", "C", "D", "F"),
  product    = c(
    "productA", "productB", "productB",
    "productCC", "productDD", "productF"
  ),
  ingredient = c(
    "ingredientA", "ingredientBB", "ingredientB",
    "ingredientC", "ingredientDD", "ingredientF"
  ),
  DF         = c("Tablet", "Powder", "Powder", "Suspension", "injection", "tablet"),
  Route      = c("ORAL", "INHALATION", "INHALATION", "topical", "injecatable", "oral")
)
edit for multiple matches:
# Variant that keeps EVERY df_2 partner found at the first key set that matches
# a df row, instead of a single partner per row.
#
# In this df_2, identifier 4 / key "D" appears twice; the extra row has NA for
# ingredient, DF and Route.
# NOTE(review): the joins below read df's unsuffixed product / ingredient / DF
# columns (i.product, ...). The earlier snippet renames them to *_1 with
# setnames(), so presumably df has to be rebuilt and setDT() applied again
# before running this part - TODO confirm.
df_2 <- data.frame( identifier=c(1,2,2,3,4,4,6), key=c("A","B","B","C","D","D","F"), product=c("productA","productB","productB","productCC","productDD","productDd","productF"), ingredient=c("ingredientA","ingredientBB","ingredientB","ingredientC","ingredientDD",NA,"ingredientF"), DF=c("Tablet","Powder","Powder","Suspension","injection",NA,"tablet"), Route=c("ORAL","INHALATION","INHALATION","topical","injecatable",NA,"oral") )
setDT(df_2)
# check marks the df rows that already found a partner; every row starts FALSE.
# Match is only reset to NA here - nothing below fills it in.
df[, c("Match", "check") := .(NA_character_, FALSE)]
# All optional join columns that occur in any key set.
ocols <- unique(unlist(keyord))
# One join per key set, in the order given by keyord; the pieces are stacked
# into a single table at the end.
rbindlist(lapply(keyord, function(v) {
k <- c("identifier", "key", v)
# Join the df rows that are still unmatched (i) to df_2 (x) on the current key
# set. The parentheses around !check make it a plain logical filter rather than
# data.table's not-join prefix. nomatch=0L keeps matched rows only, so df rows
# that never find a partner do not appear in the result at all.
# i.* columns come from df and are renamed to *_1; x.* columns come from df_2
# and are renamed to *_2 (Route exists only on the df_2 side).
a <- df_2[df[(!check)], on=k, nomatch=0L, c(.(id=id),
setNames(mget(paste0("i.", ocols)), paste0(ocols, "_1")),
setNames(mget(paste0("x.", c(ocols, "Route"))), paste0(c(ocols, "Route"), "_2")))
]
# Flag the ids matched in this round (by reference) so that the following, less
# specific key sets skip them.
df[id %chin% a$id, check := TRUE]
a
}), use.names=TRUE)
output:
id product_1 ingredient_1 DF_1 product_2 ingredient_2 DF_2 Route_2
1: 1-1 productA ingredientA Tablet productA ingredientA Tablet ORAL
2: 2-2 productB ingredientB Powder productB ingredientB Powder INHALATION
3: 3-3 productC ingredientC Suspension productCC ingredientC Suspension topical
4: 6-6 productF ingredientF Capsule productF ingredientF tablet oral
5: 4-4 productD ingredientD System productDD ingredientDD injection injecatable
6: 4-4 productD ingredientD System productDd <NA> <NA> <NA>
Answered by chinsoon12 on November 4, 2021
Here is a solution that might feel slightly over-engineered but achieves the expected outcome:
library(dplyr)
library(purrr)
library(stringr)
#' Describe which columns agree between the two sides of a joined data frame
#'
#' For every row, builds a "+"-separated label: the join keys (or "None" when
#' the row has no partner) followed by the name of each compared column whose
#' `_1` and `_2` values are equal.
#'
#' The label is assembled column by column instead of pasting everything and
#' stripping "NA" afterwards with a regular expression. The regex approach
#' needs "\\+" (a single backslash before "+" is not a valid escape in an R
#' string literal), removes both separators around a non-matching middle column
#' ("product+NA+DF" becomes "productDF") and damages column names that contain
#' "NA".
#'
#' @param data Data frame produced by a join with suffixes "_1" and "_2".
#' @param cols Base names of the columns to compare (`<col>_1` vs `<col>_2`).
#' @param keys Names of the join keys; they start the label of matched rows.
#' @return Unnamed character vector with one label per row of `data`, e.g.
#'   "identifer+key+product+DF", "identifer+key" or "None".
get_match <- function(data, cols, keys) {
  # The join drops the duplicated key columns, so a row counts as matched when
  # both sides have the same number of missing values among `cols`.
  # NOTE(review): a partner row that itself contains NA in one of `cols` is
  # therefore labelled "None" - confirm this is acceptable for your data.
  n_na_1 <- rowSums(is.na(data[paste0(cols, "_1")]))
  n_na_2 <- rowSums(is.na(data[paste0(cols, "_2")]))
  rtn <- ifelse(n_na_1 == n_na_2, paste(keys, collapse = "+"), "None")

  for (col in cols) {
    same <- as.character(data[[paste0(col, "_1")]]) ==
      as.character(data[[paste0(col, "_2")]])
    # An NA comparison (value missing on either side) is not a match.
    hit <- !is.na(same) & same
    if (any(hit)) {
      rtn[hit] <- paste(rtn[hit], col, sep = "+")
    }
  }

  # ifelse() inherits the row names from rowSums(); drop them.
  unname(rtn)
}
# Join on the keys only, label every candidate pair with get_match(), then keep
# for each row of df the pair(s) whose label lists the most matching columns.
# "identifer" (sic) is the spelling of the key column in the question's data.
df_out <- left_join(
  df, df_2,
  by = c("identifer", "key"),
  suffix = c("_1", "_2")
) %>%
  mutate(
    Match = get_match(
      .,
      cols = c("product", "ingredient", "DF"),
      keys = c("identifer", "key")
    ),
    # Number of "+" separators = how many names the label holds beyond the
    # first. "+" is a regex metacharacter, and the backslash that escapes it
    # must itself be doubled inside an R string literal ("\+" does not parse).
    match_strength = str_count(Match, "\\+")
  ) %>%
  group_by(id) %>%
  filter(match_strength == max(match_strength, na.rm = TRUE)) %>%
  # Return a plain (ungrouped) tibble so later verbs are not applied per id.
  ungroup()
`dplyr::left_join()` drops the duplicated `by` key columns from the result, so the only way I found to add the keys to the label was to check whether the `_1` or the `_2` columns were all missing. I could have used the `keep = TRUE` option and removed/renamed the extra key columns afterwards, though...
Answered by Dan Chaltiel on November 4, 2021
Get help from others!
Recent Questions
Recent Answers
© 2024 TransWikia.com. All rights reserved. Sites we Love: PCI Database, UKBizDB, Menu Kuliner, Sharing RPP