r - Function or other basic script that compares values on two variables in a dataframe using an id variable located in both -


Let's say I have 2 data frames, both of which contain some, but not all, of the same records. For records that are the same, the id variable in both data frames matches. There is a particular variable in each data frame that needs to be checked for consistency across the data frames, and any discrepancies need to be printed:

d1 <- ## first dataframe
d2 <- ## second dataframe

colnames(d1) ## column headings of dataframe 1
## [1] "id" "variable1" "variable2" "variable3"

colnames(d2) ## column headings of dataframe 2 are identical
## [1] "id" "variable1" "variable2" "variable3"

length(d1$id) ## there are 200 records in dataframe 1
## [1] 200

length(d2$id) ## there are not the same number in dataframe 2
## [1] 150

## some function that takes d1$id, matches it with d2$id, then compares the values of the matched records, returning any discrepancies

I constructed an elaborate loop to do this, but I feel as though this is not the right way of going about it. Surely there is a better way than this for-if-for-if-if statement.

## Print the id of every record that appears in both d1 and d2 but whose
## variable1 value differs between the two data frames.
## Expects d1 and d2 to exist, each with columns `id` and `variable1`.
for (i in seq_along(d1$id)) { ## sets the counter for the outer loop
  if (d1$id[i] %in% d2$id) { ## searches, compares and saves the common id and variable
    index <- d1$id[i]
    variable_d1 <- d1$variable1[i]
    for (p in seq_along(d2$id)) { ## scans d2 for the row carrying the same id
      if (d2$id[p] == index) { ## saves the corresponding value in the second dataframe
        variable_d2 <- d2$variable1[p]
        if (variable_d2 != variable_d1) { ## prints if not equal
          print(index)
        }
      }
    }
  }
}

Here's a solution, using random input data with a 50% chance that a given cell will be discrepant between d1 and d2:

## Build two random data frames that share some (but not all) ids, then find
## the ids whose values differ between them, one variable at a time.
## NOTE(review): the printed outputs below are the ones from the original post.
## They appear to come from the pre-3.6.0 default sampler, so a current R
## session will likely print different numbers for the same seed -- confirm.
set.seed(1)
d1 <- data.frame(
  id = sample(300, 200),
  variable1 = sample(2, 200, replace = TRUE),
  variable2 = sample(2, 200, replace = TRUE),
  variable3 = sample(2, 200, replace = TRUE)
)
d2 <- data.frame(
  id = sample(300, 150),
  variable1 = sample(2, 150, replace = TRUE),
  variable2 = sample(2, 150, replace = TRUE),
  variable3 = sample(2, 150, replace = TRUE)
)
head(d1)
##    id variable1 variable2 variable3
## 1  80         1         2         2
## 2 112         1         1         2
## 3 171         2         2         1
## 4 270         1         2         2
## 5  60         1         2         2
## 6 266         2         2         2
head(d2)
##    id variable1 variable2 variable3
## 1 258         1         2         1
## 2  11         1         1         1
## 3 290         2         1         2
## 4 222         2         1         2
## 5  81         2         1         1
## 6 200         1         2         1
com <- intersect(d1$id, d2$id) ## derive the common id values
d1com <- match(com, d1$id) ## find the indexes of d1 that correspond to the common id values, in the order of com
d2com <- match(com, d2$id) ## find the indexes of d2 that correspond to the common id values, in the order of com
v1diff <- com[d1$variable1[d1com] != d2$variable1[d2com]] ## ids of variable1 discrepancies
v1diff
##  [1]  60 278  18 219 290  35 107   4 237 131  50 210  29 168   6 174  61 127  99 220 247 244 157  51  84 122 196 125 265 115 186 139   3 132 223 211 268 102 155 207 238  41 199 200 231 236 172 275 250 176 248 255 222  59 100  33 124
v2diff <- com[d1$variable2[d1com] != d2$variable2[d2com]] ## ids of variable2 discrepancies
v2diff
##  [1] 112  60  18 198 219 290 131  50 210  29 168 258 215 291 127 161  99 220 110 293  87 164  84 122 196 125 186 139  81 132  82  89 223 268  98  14 155 241 207 231 172  62 275 176 248 255  59 298 100  12 156
v3diff <- com[d1$variable3[d1com] != d2$variable3[d2com]] ## ids of variable3 discrepancies
v3diff
##  [1] 278 219 290  35   4 237 131 168 202 174 215 220 247 244 261 293 164  13 294  84 196 125 265 115 186  81   3  89 223 211 268  98  14 155 241 207  38 191 200 276 250  45 269 255 298 100  12 156 124

Here's proof that the variable1 values for the ids in v1diff are discrepant between d1 and d2:

## variable1 for the ids listed in v1diff: first as stored in d1, then as
## stored in d2. The two printed vectors (shown below in that order) differ
## at every position.
d1$variable1[match(v1diff, d1$id)]
d2$variable1[match(v1diff, d2$id)]
##  [1] 1 2 2 1 1 2 2 1 1 1 2 2 2 2 1 2 2 1 2 2 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 2 2 2 1 2 2 1 1 2 1 1 2 1 2 1 2 2 1 2 2 1 1
##  [1] 2 1 1 2 2 1 1 2 2 2 1 1 1 1 2 1 1 2 1 1 2 2 1 2 2 1 2 2 2 2 2 2 2 2 2 1 1 1 2 1 1 2 2 1 2 2 1 2 1 2 1 1 2 1 1 2 2

Here's proof that the variable1 values for the ids not in v1diff are not discrepant between d1 and d2:

## variable1 for the common ids that are NOT in v1diff, ordered by id so the
## two data frames line up: first d1, then d2. The two printed vectors (shown
## below in that order) are identical.
with(
  subset(d1, id %in% com & !id %in% v1diff),
  variable1[order(id)]
)
with(
  subset(d2, id %in% com & !id %in% v1diff),
  variable1[order(id)]
)
##  [1] 1 1 2 1 1 1 2 2 1 2 2 1 2 2 1 1 2 1 2 1 2 1 1 1 1 1 1 2 2 2 2 1 1 1 2 2 2 1 1 1 1
##  [1] 1 1 2 1 1 1 2 2 1 2 2 1 2 2 1 1 2 1 2 1 2 1 1 1 1 1 1 2 2 2 2 1 1 1 2 2 2 1 1 1 1

Here, I've wrapped the solution in a function that returns the vectors of discrepant id values in a list, with each component named for the variable it represents:

## Find, for each compared column, the ids whose values differ between two
## data frames.
##
## d1, d2: data frames that each have an `id` column.
## cols:   character vector of column names to compare; defaults to every
##         column name the two data frames share, except "id".
##
## Returns a list with one component per entry of `cols`, named after that
## column, holding the common ids at which d1 and d2 disagree.
compare <- function(d1, d2,
                    cols = setdiff(intersect(colnames(d1), colnames(d2)), "id")) {
  com <- intersect(d1$id, d2$id) ## ids present in both data frames
  d1com <- match(com, d1$id) ## row positions of those ids in d1, in the order of com
  d2com <- match(com, d2$id) ## row positions of those ids in d2, in the order of com
  ## setNames() (stats) labels each list component with its column name
  setNames(
    lapply(cols, function(col) com[d1[[col]][d1com] != d2[[col]][d2com]]),
    cols
  )
}

compare(d1, d2)
## $variable1
##  [1]  60 278  18 219 290  35 107   4 237 131  50 210  29 168   6 174  61 127  99 220 247 244 157  51  84 122 196 125 265 115 186 139   3 132 223 211 268 102 155 207 238  41 199 200 231 236 172 275 250 176 248 255 222  59 100  33 124
##
## $variable2
##  [1] 112  60  18 198 219 290 131  50 210  29 168 258 215 291 127 161  99 220 110 293  87 164  84 122 196 125 186 139  81 132  82  89 223 268  98  14 155 241 207 231 172  62 275 176 248 255  59 298 100  12 156
##
## $variable3
##  [1] 278 219 290  35   4 237 131 168 202 174 215 220 247 244 261 293 164  13 294  84 196 125 265 115 186  81   3  89 223 211 268  98  14 155 241 207  38 191 200 276 250  45 269 255 298 100  12 156 124

Comments

Popular posts from this blog

python - TypeError: start must be a integer -

c# - DevExpress RepositoryItemComboBox BackColor property ignored -

django - Creating multiple model instances in DRF3 -