From: "Gemt af Internet Explorer 11" Subject: Georgia - LFS and donor imputation Date: Wed, 30 Dec 2020 10:34:57 +0100 MIME-Version: 1.0 Content-Type: text/html; charset="utf-8" Content-Transfer-Encoding: quoted-printable Content-Location: file://C:\Temp\wz7fe2\Exploration.html X-MimeOLE: Produced By Microsoft MimeOLE V6.1.7601.24158 =EF=BB=BF
=20 =20 =20 =library(tidyverse)=0A=
library(simputation)
lfs_demo <- haven::read_spss("SPSS - =
Eng-2019/LFS_Demograpics.sav")=0A=
=0A=
lfs_ecstat <- haven::read_spss("SPSS - Eng-2019/LFS_ECSTAT.sav")=0A=
=0A=
lfs_family <- haven::read_spss("SPSS - Eng-2019/LFS_Familysize.sav")=0A=
=0A=
lfs_id <- haven::read_spss("SPSS - Eng-2019/LFS_ID.sav")=0A=
=0A=
lfs_weights <- haven::read_spss("SPSS - =
Eng-2019/LFS_Personal_weights.sav")
No. of observations in each data set:
ls(pattern =3D "lfs_.*") %>% =0A=
set_names(.) %>% =0A=
map(get) %>% =0A=
map_int(nrow)
## lfs_demo lfs_ecstat lfs_family lfs_id =
lfs_weights =0A=
## 77005 61386 21501 21501 =
61386
Demographics has more observations than weights and ecstat. Item non=20 response? (household member level)
Calculate simple response rate:
nrow(lfs_weights)/nrow(lfs_demo)
## [1] 0.797169
Construct data for analysis:
lfs <- lfs_id %>% =0A=
left_join(lfs_weights, by =3D "UID") %>% =0A=
left_join(lfs_ecstat, by =3D c("UID", "MemberNo")) %>% =0A=
rename_with(str_to_lower) %>% =0A=
select(year, month, quarterno, uid, memberno, region, urban_rural, =
p_weights, sex, age,=0A=
active, active_soft, employed:not_identified, =
starts_with("b"))
Snapshot of data:
lfs %>% print(n_extra =3D 0)
## # A tibble: 61,386 x 41=0A=
## year month quarterno uid memberno region urban_rural =
p_weights sex=0A=
## <dbl> <dbl> <chr> <dbl> <dbl> =
<dbl+lb> <dbl+lbl> <dbl> <dbl+l>=0A=
## 1 2019 3 91 45405 1 29 [Kak~ 1 [Urban] =
106. 2 [Man]=0A=
## 2 2019 3 91 45405 2 29 [Kak~ 1 [Urban] =
107. 1 [Wom~=0A=
## 3 2019 9 93 45406 1 29 [Kak~ 2 [Rural] =
183. 1 [Wom~=0A=
## 4 2019 1 91 45407 1 29 [Kak~ 2 [Rural] =
234. 1 [Wom~=0A=
## 5 2019 5 92 45408 2 11 [Tbi~ 1 [Urban] =
824. 1 [Wom~=0A=
## 6 2019 12 94 45409 1 11 [Tbi~ 1 [Urban] =
424. 1 [Wom~=0A=
## 7 2019 12 94 45409 2 11 [Tbi~ 1 [Urban] =
443. 2 [Man]=0A=
## 8 2019 12 94 45410 2 11 [Tbi~ 1 [Urban] =
335. 1 [Wom~=0A=
## 9 2019 12 94 45410 1 11 [Tbi~ 1 [Urban] =
349. 2 [Man]=0A=
## 10 2019 12 94 45410 4 11 [Tbi~ 1 [Urban] =
335. 1 [Wom~=0A=
## # ... with 61,376 more rows, and 32 more variables
Which quarters are included:
lfs %>% count(year,quarterno)
## # A tibble: 4 x 3=0A=
## year quarterno n=0A=
## <dbl> <chr> <int>=0A=
## 1 2019 91 15308=0A=
## 2 2019 92 15555=0A=
## 3 2019 93 15446=0A=
## 4 2019 94 15077
Check total population for 4th quarter:
lfs %>% =0A=
group_by(quarterno) %>% =0A=
summarise(n =3D n(), N =3D sum(p_weights), .groups =3D =
"drop")
## # A tibble: 4 x 3=0A=
## quarterno n N=0A=
## <chr> <int> <dbl>=0A=
## 1 91 15308 3029168.=0A=
## 2 92 15555 3043425.=0A=
## 3 93 15446 3044084.=0A=
## 4 94 15077 3031826.
lfs %>% =0A=
filter(quarterno =3D=3D '94') %>% =0A=
group_by(quarterno, sex) %>% =0A=
summarise(n =3D n(), N =3D sum(p_weights), .groups =3D =
"drop")
## # A tibble: 2 x 4=0A=
## quarterno sex n N=0A=
## <chr> <dbl+lbl> <int> <dbl>=0A=
## 1 94 1 [Woman] 7876 1607467.=0A=
## 2 94 2 [Man] 7201 1424359.
lfs %>% =0A=
filter(quarterno =3D=3D '94') %>% =0A=
group_by(quarterno, region) %>% =0A=
summarise(n =3D n(), N =3D sum(p_weights), .groups =3D =
"drop")
## # A tibble: 11 x 4=0A=
## quarterno region n N=0A=
## <chr> <dbl+lbl> =
<int> <dbl>=0A=
## 1 94 11 [Tbilisi] 1915 937861.=0A=
## 2 94 15 [Adjara A/R] 1714 268115.=0A=
## 3 94 23 [Guria] 1028 98022.=0A=
## 4 94 26 [Imereti] 1465 430964.=0A=
## 5 94 29 [Kakheti] 1469 241949.=0A=
## 6 94 32 [Mtskheta-Mtianeti] 857 80163.=0A=
## 7 94 35 [Racha-Lechkhumi and Kvemo Svaneti] 808 26989.=0A=
## 8 94 38 [Samegrelo and Zemo Svaneti] 1568 260699.=0A=
## 9 94 41 [Samtskhe-Javakheti] 1164 134792.=0A=
## 10 94 44 [Kvemo Kartli] 1453 331493.=0A=
## 11 94 47 [Shida Kartli] 1636 =
220777.
lfs %>% =0A=
filter(quarterno =3D=3D '94') %>% =0A=
group_by(quarterno, employed) %>% =0A=
summarise(n =3D n(), N =3D sum(p_weights), .groups =3D =
"drop")
## # A tibble: 2 x 4=0A=
## quarterno employed n N=0A=
## <chr> <dbl+lbl> <int> <dbl>=0A=
## 1 94 0 [No] 5978 1365316.=0A=
## 2 94 1 [Yes] 9099 1666509.
First I calculate the overall response matrix R:
R <- lfs %>% =0A=
select(-year,-month) %>% =0A=
mutate(across(.cols =3D -c(1:6), .fns =3D negate(is.na))) %>% =0A=
mutate(across(.cols =3D -c(1:6), .fns =3D as.numeric))=0A=
=0A=
R %>% select(-c(1:6)) %>% print(n_extra =3D 0)
## # A tibble: 61,386 x 33=0A=
## sex age active active_soft employed hired self_employed =
not_identified=0A=
## <dbl> <dbl> <dbl> <dbl> =
<dbl> <dbl> <dbl> <dbl>=0A=
## 1 1 1 1 1 1 1 1 =
1=0A=
## 2 1 1 1 1 1 1 1 =
1=0A=
## 3 1 1 1 1 1 1 1 =
1=0A=
## 4 1 1 1 1 1 1 1 =
1=0A=
## 5 1 1 1 1 1 1 1 =
1=0A=
## 6 1 1 1 1 1 1 1 =
1=0A=
## 7 1 1 1 1 1 1 1 =
1=0A=
## 8 1 1 1 1 1 1 1 =
1=0A=
## 9 1 1 1 1 1 1 1 =
1=0A=
## 10 1 1 1 1 1 1 1 =
1=0A=
## # ... with 61,376 more rows, and 25 more variables
Can also calculate per question:
R_j <- R %>% =0A=
summarise(across(.cols =3D -c(1:6), .fns =3D mean)) %>% =0A=
pivot_longer(cols =3D everything(), names_to =3D "variable", values_to =
=3D "R_mean")=0A=
=0A=
R_j
## # A tibble: 33 x 2=0A=
## variable R_mean=0A=
## <chr> <dbl>=0A=
## 1 sex 1 =0A=
## 2 age 1 =0A=
## 3 active 1 =0A=
## 4 active_soft 1 =0A=
## 5 employed 1 =0A=
## 6 hired 1 =0A=
## 7 self_employed 1 =0A=
## 8 not_identified 1 =0A=
## 9 b4_nace_1 0.608=0A=
## 10 brunch_converted_1 0.608=0A=
## # ... with 23 more rows
R_j %>% arrange(R_mean)
## # A tibble: 33 x 2=0A=
## variable R_mean=0A=
## <chr> <dbl>=0A=
## 1 b26_97_type_of_transport 0.00453=0A=
## 2 b26_5_type_of_transport 0.00681=0A=
## 3 b19_number_of_employees_in_establishment 0.00826=0A=
## 4 b26_3_type_of_transport 0.0172 =0A=
## 5 b10_work_duration 0.0202 =0A=
## 6 b26_2_type_of_transport 0.0291 =0A=
## 7 b16_persons_under_supervision 0.0310 =0A=
## 8 b26_4_type_of_transport 0.0966 =0A=
## 9 b26_1_type_of_transport 0.0972 =0A=
## 10 b8_permanency_of_the_job 0.259 =0A=
## # ... with 23 more rows
Clear signs of routing I=E2=80=99m not catching right now! Look at = the interviews=20 instead:
R_i <- R %>% =0A=
group_by(across(1:6)) %>% =0A=
pivot_longer(cols =3D -group_cols(), names_to =3D "variable", =
values_to =3D "R") %>% =0A=
summarise(R_mean =3D mean(R), .groups =3D "drop")=0A=
=0A=
R_i
## # A tibble: 61,386 x 7=0A=
## quarterno uid memberno region urban_rural =
p_weights R_mean=0A=
## <chr> <dbl> <dbl> =
<dbl+lbl> <dbl+lbl> <dbl> <dbl>=0A=
## 1 91 45405 1 29 [Kakheti] 1 [Urban] =
106. 0.242=0A=
## 2 91 45405 2 29 [Kakheti] 1 [Urban] =
107. 0.242=0A=
## 3 91 45407 1 29 [Kakheti] 2 [Rural] =
234. 0.606=0A=
## 4 91 45419 1 38 [Samegrelo and Zemo~ 1 [Urban] =
302. 0.242=0A=
## 5 91 45419 2 38 [Samegrelo and Zemo~ 1 [Urban] =
302. 0.242=0A=
## 6 91 45433 1 29 [Kakheti] 1 [Urban] =
92.4 0.606=0A=
## 7 91 45433 2 29 [Kakheti] 1 [Urban] =
93.6 0.242=0A=
## 8 91 45433 3 29 [Kakheti] 1 [Urban] =
92.5 0.242=0A=
## 9 91 45434 1 29 [Kakheti] 2 [Rural] =
192. 0.242=0A=
## 10 91 45434 2 29 [Kakheti] 2 [Rural] =
190. 0.242=0A=
## # ... with 61,376 more rows
R_i %>% arrange(desc(R_mean))
## # A tibble: 61,386 x 7=0A=
## quarterno uid memberno region urban_rural =
p_weights R_mean=0A=
## <chr> <dbl> <dbl> =
<dbl+lbl> <dbl+lbl> <dbl> <dbl>=0A=
## 1 93 65081 2 23 [Guria] 2 [Rural] =
95.9 0.818=0A=
## 2 94 48046 6 38 [Samegrelo and Zemo~ 1 [Urban] =
126. 0.818=0A=
## 3 91 48092 1 35 [Racha-Lechkhumi an~ 2 [Rural] =
28.6 0.788=0A=
## 4 91 48686 2 47 [Shida Kartli] 2 [Rural] =
123. 0.788=0A=
## 5 91 49075 4 38 [Samegrelo and Zemo~ 2 [Rural] =
157. 0.788=0A=
## 6 91 49208 1 47 [Shida Kartli] 2 [Rural] =
153. 0.788=0A=
## 7 91 50640 1 38 [Samegrelo and Zemo~ 2 [Rural] =
257. 0.788=0A=
## 8 91 51787 1 47 [Shida Kartli] 1 [Urban] =
204. 0.788=0A=
## 9 91 52480 3 11 [Tbilisi] 1 [Urban] =
257. 0.788=0A=
## 10 91 53122 1 38 [Samegrelo and Zemo~ 2 [Rural] =
126. 0.788=0A=
## # ... with 61,376 more rows
What about the NACE levels:
R %>% select(uid, memberno, employed, =
matches("nace"))
## # A tibble: 61,386 x 5=0A=
## uid memberno employed b4_nace_1 b4_nace_2=0A=
## <dbl> <dbl> <dbl> <dbl> =
<dbl>=0A=
## 1 45405 1 1 0 0=0A=
## 2 45405 2 1 0 0=0A=
## 3 45406 1 1 0 0=0A=
## 4 45407 1 1 1 1=0A=
## 5 45408 2 1 0 0=0A=
## 6 45409 1 1 0 0=0A=
## 7 45409 2 1 1 1=0A=
## 8 45410 2 1 1 1=0A=
## 9 45410 1 1 1 1=0A=
## 10 45410 4 1 1 1=0A=
## # ... with 61,376 more rows
lfs %>% count(employed, =
is.na(b4_nace_1))
## # A tibble: 3 x 3=0A=
## employed `is.na(b4_nace_1)` n=0A=
## <dbl+lbl> <lgl> <int>=0A=
## 1 0 [No] TRUE 24060=0A=
## 2 1 [Yes] FALSE 37324=0A=
## 3 1 [Yes] TRUE 2
lfs %>% count(employed, =
is.na(b4_nace_2))
## # A tibble: 3 x 3=0A=
## employed `is.na(b4_nace_2)` n=0A=
## <dbl+lbl> <lgl> <int>=0A=
## 1 0 [No] TRUE 24060=0A=
## 2 1 [Yes] FALSE 37324=0A=
## 3 1 [Yes] TRUE 2
Only 2 non-response for NACE 2 in the entire = year!
lfs %>% =0A=
filter(employed =3D=3D 1, is.na(b4_nace_1)) %>% =0A=
select(quarterno, uid, region, p_weights, sex, age, =
matches("nace"))
## # A tibble: 2 x 8=0A=
## quarterno uid region p_weights sex age b4_nace_1 =
b4_nace_2=0A=
## <chr> <dbl> <dbl+lbl> <dbl> =
<dbl+lbl> <dbl> <dbl> <dbl>=0A=
## 1 92 50041 11 [Tbilisi] 840. 2 [Man] 29 NA =
NA=0A=
## 2 91 50294 11 [Tbilisi] 868. 2 [Man] 29 NA =
NA
lfs_empl <- lfs %>% filter(employed =3D=3D =
1)
Special interest in the following variables:
lfs_empl %>% =
summarise(across(hired:not_identified, sum))
## # A tibble: 1 x 3=0A=
## hired self_employed not_identified=0A=
## <dbl> <dbl> <dbl>=0A=
## 1 15864 21453 9
lfs_empl %>% =
count(is.na(b4_nace_1))
## # A tibble: 2 x 2=0A=
## `is.na(b4_nace_1)` n=0A=
## <lgl> <int>=0A=
## 1 FALSE 37324=0A=
## 2 TRUE 2
lfs_empl %>% =
count(is.na(b5_occupation))
## # A tibble: 2 x 2=0A=
## `is.na(b5_occupation)` n=0A=
## <lgl> <int>=0A=
## 1 FALSE 37320=0A=
## 2 TRUE 6
lfs_empl %>% count(b6_sector)
## # A tibble: 4 x 2=0A=
## b6_sector n=0A=
## <dbl+lbl> <int>=0A=
## 1 1 [State ownership] 6341=0A=
## 2 2 [Private ownership] 30871=0A=
## 3 97 [Other type] 106=0A=
## 4 98 [Do not know] 8
lfs_empl %>% count(b7_status)
## # A tibble: 6 x 2=0A=
## b7_status n=0A=
## <dbl+lbl> <int>=0A=
## 1 1 [Employee] 15864=0A=
## 2 2 [Employer] 507=0A=
## 3 3 [Own account worker] 12583=0A=
## 4 4 [Members of producers=E2=80=99 cooperatives] 9=0A=
## 5 5 [Unpaid family worker] 8354=0A=
## 6 97 9
lfs_empl %>% =
count(b8_permanency_of_the_job)
## # A tibble: 6 x 2=0A=
## b8_permanency_of_the_job n=0A=
## <dbl+lbl> <int>=0A=
## 1 1 [Permanent] 14629=0A=
## 2 2 [Temporary] 822=0A=
## 3 3 [Seasonal] 325=0A=
## 4 4 [Odd (Casual)] 51=0A=
## 5 98 [Do not know] 46=0A=
## 6 NA 21453
lfs_empl %>% =
count(b17_b18_net_earnings)
## # A tibble: 11 x 2=0A=
## b17_b18_net_earnings n=0A=
## <dbl+lbl> <int>=0A=
## 1 1 [100 GEL or less] 295=0A=
## 2 2 [101-200 GEL] 1487=0A=
## 3 3 [201-400 GEL] 4195=0A=
## 4 4 [401-600 GEL] 3500=0A=
## 5 5 [601-800 GEL] 2166=0A=
## 6 6 [801-1000 GEL] 1413=0A=
## 7 7 [1001-1500 GEL] 892=0A=
## 8 8 [1501-2000 GEL] 233=0A=
## 9 9 [More than 2000 GEL] 137=0A=
## 10 88 [Refuse the answer] 1555=0A=
## 11 NA 21453
lfs_empl %>% count(b21_workplace)
## # A tibble: 14 x 2=0A=
## b21_workplace =
n=0A=
## =
<dbl+lbl> <int>=0A=
## 1 1 [At his/her household dwelling] =
230=0A=
## 2 2 [Client's place] =
1269=0A=
## 3 3 [Formal office] =
9047=0A=
## 4 4 [Factory/atelier/workshop, etc. instead of your living place] =
2245=0A=
## 5 5 [Plantations/farm/garden/field] =
17558=0A=
## 6 6 [Construction sites] =
928=0A=
## 7 7 [Mines/quarry] =
100=0A=
## 8 8 [Shop /kiosk/coffee house/restaurant/hotel] =
2491=0A=
## 9 9 [Different places (mobile)] =
2512=0A=
## 10 10 [Fixed, street or market stall] =
494=0A=
## 11 11 [Pond/lake/river /see] =
151=0A=
## 12 12 [Forest] =
30=0A=
## 13 97 [Other] =
268=0A=
## 14 NA =
3
lfs_empl %>% =
count(b22__employed_at_local_unit)
## # A tibble: 7 x 2=0A=
## b22__employed_at_local_unit n=0A=
## <dbl+lbl> <int>=0A=
## 1 1 [1-10 persons] 20544=0A=
## 2 2 [11-19 persons] 2597=0A=
## 3 3 [20-49 persons] 3654=0A=
## 4 4 [50 or more] 4147=0A=
## 5 98 [Do not know] 1528=0A=
## 6 99 [Not applicable] 4855=0A=
## 7 NA 1
Conclusion: Additional coded non-response that could also be=20 imputed.
Before imputation all coded non-response should be=20 handled!
lfs_empl_2 <- lfs_empl %>% =0A=
haven::zap_labels() %>%=0A=
transmute(=0A=
quarterno, uid, memberno, p_weights,=0A=
region, urban_rural, sex, age,=0A=
hired =3D hired + not_identified, self_employed,=0A=
b4_nace_1 =3D str_pad(string =3D b4_nace_1, width =3D 4, side =3D =
"l", pad =3D "0") %>% =0A=
str_sub(1,1), =0A=
b5_occupation =3D str_pad(string =3D b5_occupation, width =3D 4, =
side =3D "l", pad =3D "0") %>%=0A=
str_sub(1,1),=0A=
b6_sector =3D case_when(b6_sector =3D=3D 97 ~ 2, b6_sector =3D=3D 98 =
~ NA_real_, TRUE ~ b6_sector),=0A=
b7_status =3D case_when(b7_status =3D=3D 97 ~ NA_real_, TRUE ~ =
b7_status),=0A=
b8_permanency_of_the_job =3D case_when(=0A=
self_employed =3D=3D 1 ~ 0,=0A=
b8_permanency_of_the_job %in% 3:4 ~ 2,=0A=
b8_permanency_of_the_job =3D=3D 98 ~ NA_real_,=0A=
TRUE ~ b8_permanency_of_the_job=0A=
),=0A=
b17_b18_net_earnings =3D case_when(=0A=
self_employed =3D=3D 1 ~ 0,=0A=
b17_b18_net_earnings =3D=3D 88 ~ NA_real_,=0A=
TRUE ~ b17_b18_net_earnings=0A=
),=0A=
b22__employed_at_local_unit =3D case_when(=0A=
b22__employed_at_local_unit %in% 98:99 ~ NA_real_,=0A=
TRUE ~ b22__employed_at_local_unit=0A=
)=0A=
) %>% =0A=
mutate(across(.cols =3D -c(quarterno, uid, memberno, age, p_weights), =
.fns =3D as.factor))
summary(lfs_empl_2)
## quarterno uid memberno =
p_weights =0A=
## Length:37326 Min. :45407 Min. : 1.000 Min. : 9.37 =0A=
## Class :character 1st Qu.:50964 1st Qu.: 1.000 1st Qu.: 106.19 =0A=
## Mode :character Median :56262 Median : 2.000 Median : 145.76 =0A=
## Mean :56229 Mean : 2.168 Mean : 181.13 =0A=
## 3rd Qu.:61532 3rd Qu.: 3.000 3rd Qu.: 212.90 =0A=
## Max. :66905 Max. :15.000 Max. :1867.20 =0A=
## =0A=
## region urban_rural sex age hired =
self_employed=0A=
## 15 : 4345 1:12552 1:17572 Min. :15.00 0:21453 =
0:15873 =0A=
## 29 : 3969 2:24774 2:19754 1st Qu.:35.00 1:15873 =
1:21453 =0A=
## 38 : 3943 Median :49.00 =
=0A=
## 26 : 3903 Mean :48.15 =
=0A=
## 47 : 3669 3rd Qu.:60.00 =
=0A=
## 44 : 3422 Max. :99.00 =
=0A=
## (Other):14075 =
=0A=
## b4_nace_1 b5_occupation b6_sector b7_status =0A=
## 0 :17740 6 :17320 1 : 6341 1 :15864 =0A=
## 5 : 4457 2 : 3625 2 :30977 2 : 507 =0A=
## 8 : 4205 5 : 3608 NA's: 8 3 :12583 =0A=
## 4 : 2661 7 : 2804 4 : 9 =0A=
## 7 : 2468 9 : 2504 5 : 8354 =0A=
## (Other): 5793 (Other): 7459 NA's: 9 =0A=
## NA's : 2 NA's : 6 =0A=
## b8_permanency_of_the_job b17_b18_net_earnings =
b22__employed_at_local_unit=0A=
## 0 :21453 0 :21453 1 :20544 =
=0A=
## 1 :14629 3 : 4195 2 : 2597 =
=0A=
## 2 : 1198 4 : 3500 3 : 3654 =
=0A=
## NA's: 46 5 : 2166 4 : 4147 =
=0A=
## 2 : 1487 NA's: 6384 =
=0A=
## (Other): 2970 =
=0A=
## NA's : 1555
set.seed(2020)=0A=
=0A=
lfs_complete <- lfs_empl_2 %>% =0A=
filter(across(.fns =3D negate(is.na)))=0A=
=0A=
lfs_test <- lfs_complete %>% slice_sample(prop =3D 1/3) %>% =
mutate(b4_nace_1 =3D NA)=0A=
lfs_train <- lfs_complete %>% anti_join(lfs_test, by =3D =
c("quarterno","uid","memberno"))=0A=
=0A=
lfs_incomplete <- bind_rows(lfs_test, lfs_train)
set.seed(2020)=0A=
=0A=
imput_knn1 <- lfs_incomplete %>%=0A=
as.data.frame() %>% =0A=
impute_knn(b4_nace_1 ~ sex + age + hired + self_employed + region + =
b6_sector +=0A=
b17_b18_net_earnings,=0A=
k =3D 1)=0A=
=0A=
x <- imput_knn1 %>% =0A=
transmute(quarterno,uid,memberno, b4_nace_1_imput =3D b4_nace_1) =
%>% =0A=
semi_join(lfs_test, by =3D c("quarterno","uid","memberno")) %>% =0A=
left_join(lfs_complete %>% =
select(quarterno,uid,memberno,b4_nace_1), =0A=
by =3D c("quarterno","uid","memberno")) %>% =0A=
as_tibble()=0A=
=0A=
x
## # A tibble: 9,887 x 5=0A=
## quarterno uid memberno b4_nace_1_imput b4_nace_1=0A=
## <chr> <dbl> <dbl> <fct> =
<fct> =0A=
## 1 94 52585 4 0 0 =0A=
## 2 93 64097 4 6 8 =0A=
## 3 93 51214 4 0 1 =0A=
## 4 91 52048 3 0 0 =0A=
## 5 92 48718 3 8 8 =0A=
## 6 91 51716 2 8 8 =0A=
## 7 93 64038 2 0 0 =0A=
## 8 91 45560 4 5 5 =0A=
## 9 92 51300 2 8 7 =0A=
## 10 93 57957 3 0 0 =0A=
## # ... with 9,877 more rows
mean(x$b4_nace_1_imput =3D=3D =
x$b4_nace_1)
## [1] 0.6858501
set.seed(2020)=0A=
=0A=
imput_knn5 <- lfs_incomplete %>%=0A=
as.data.frame() %>% =0A=
impute_knn(b4_nace_1 ~ sex + age + hired + self_employed + region + =
b6_sector +=0A=
b17_b18_net_earnings,=0A=
k =3D 5)=0A=
=0A=
x <- imput_knn5 %>% =0A=
transmute(quarterno,uid,memberno, b4_nace_1_imput =3D b4_nace_1) =
%>% =0A=
semi_join(lfs_test, by =3D c("quarterno","uid","memberno")) %>% =0A=
left_join(lfs_complete %>% =
select(quarterno,uid,memberno,b4_nace_1), =0A=
by =3D c("quarterno","uid","memberno")) %>% =0A=
as_tibble()=0A=
=0A=
x
## # A tibble: 9,887 x 5=0A=
## quarterno uid memberno b4_nace_1_imput b4_nace_1=0A=
## <chr> <dbl> <dbl> <fct> =
<fct> =0A=
## 1 94 52585 4 5 0 =0A=
## 2 93 64097 4 6 8 =0A=
## 3 93 51214 4 6 1 =0A=
## 4 91 52048 3 0 0 =0A=
## 5 92 48718 3 8 8 =0A=
## 6 91 51716 2 8 8 =0A=
## 7 93 64038 2 0 0 =0A=
## 8 91 45560 4 5 5 =0A=
## 9 92 51300 2 8 7 =0A=
## 10 93 57957 3 0 0 =0A=
## # ... with 9,877 more rows
mean(x$b4_nace_1_imput =3D=3D =
x$b4_nace_1)
## [1] 0.6385152
set.seed(2020)=0A=
=0A=
imput_nn <- lfs_incomplete %>% =0A=
mutate(b4_nace_1 =3D as.numeric(as.character(b4_nace_1))) %>% =0A=
impute_pmm(b4_nace_1 ~ sex + age + hired + self_employed + region + =
b6_sector +=0A=
b17_b18_net_earnings) %>% =0A=
mutate(b4_nace_1 =3D as.factor(b4_nace_1))
## Warning in predict.lm(m, newdat =3D dat[ina, , drop =3D =
FALSE]): prediction from a=0A=
## rank-deficient fit may be misleading
x <- imput_nn %>% =0A=
transmute(quarterno,uid,memberno, b4_nace_1_imput =3D b4_nace_1) =
%>% =0A=
semi_join(lfs_test, by =3D c("quarterno","uid","memberno")) %>% =0A=
left_join(lfs_complete %>% =
select(quarterno,uid,memberno,b4_nace_1), =0A=
by =3D c("quarterno","uid","memberno"))=0A=
=0A=
x
## # A tibble: 9,887 x 5=0A=
## quarterno uid memberno b4_nace_1_imput b4_nace_1=0A=
## <chr> <dbl> <dbl> <fct> =
<fct> =0A=
## 1 94 52585 4 1 0 =0A=
## 2 93 64097 4 6 8 =0A=
## 3 93 51214 4 1 1 =0A=
## 4 91 52048 3 1 0 =0A=
## 5 92 48718 3 7 8 =0A=
## 6 91 51716 2 8 8 =0A=
## 7 93 64038 2 0 0 =0A=
## 8 91 45560 4 6 5 =0A=
## 9 92 51300 2 8 7 =0A=
## 10 93 57957 3 0 0 =0A=
## # ... with 9,877 more rows
mean(x$b4_nace_1_imput =3D=3D =
x$b4_nace_1)
## [1] 0.3646202
set.seed(2020)=0A=
=0A=
imput_mf <- lfs_incomplete %>%=0A=
as.data.frame() %>% =0A=
impute_mf(b4_nace_1 ~ sex + age + hired + self_employed + region + =
b6_sector +=0A=
b17_b18_net_earnings)
## missForest iteration 1 in progress...done!=0A=
## missForest iteration 2 in progress...done!=0A=
## missForest iteration 3 in progress...done!
x <- imput_mf %>% =0A=
transmute(quarterno,uid,memberno, b4_nace_1_imput =3D b4_nace_1) =
%>% =0A=
semi_join(lfs_test, by =3D c("quarterno","uid","memberno")) %>% =0A=
left_join(lfs_complete %>% =
select(quarterno,uid,memberno,b4_nace_1), =0A=
by =3D c("quarterno","uid","memberno")) %>% =0A=
as_tibble()=0A=
=0A=
x
## # A tibble: 9,887 x 5=0A=
## quarterno uid memberno b4_nace_1_imput b4_nace_1=0A=
## <chr> <dbl> <dbl> <fct> =
<fct> =0A=
## 1 94 52585 4 0 0 =0A=
## 2 93 64097 4 6 8 =0A=
## 3 93 51214 4 0 1 =0A=
## 4 91 52048 3 0 0 =0A=
## 5 92 48718 3 8 8 =0A=
## 6 91 51716 2 8 8 =0A=
## 7 93 64038 2 0 0 =0A=
## 8 91 45560 4 3 5 =0A=
## 9 92 51300 2 8 7 =0A=
## 10 93 57957 3 0 0 =0A=
## # ... with 9,877 more rows
mean(x$b4_nace_1_imput =3D=3D =
x$b4_nace_1)
## [1] 0.6629918
** Which one is best? I go further with knn(1) donor = imputation**
Do the same imputation on the ls_empl_2
data:
lfs_empl_3 <- lfs_empl_2 %>%=0A=
mutate(b4_nace_1_old =3D b4_nace_1) %>% =0A=
as.data.frame() %>% =0A=
impute_knn(b4_nace_1 ~ sex + age + hired + self_employed + region + =
b6_sector +=0A=
b17_b18_net_earnings,=0A=
k =3D 1) %>% =0A=
as_tibble()=0A=
=0A=
lfs_empl_3 %>% =0A=
group_by(b4_nace_1) %>% =0A=
summarise(n =3D n(), n_old =3D n() - sum(is.na(b4_nace_1_old)),=0A=
N =3D sum(p_weights), N_old =3D =
sum(as.numeric(!is.na(b4_nace_1_old))*p_weights))
## `summarise()` ungrouping output (override with `.groups` =
argument)
## # A tibble: 10 x 5=0A=
## b4_nace_1 n n_old N N_old=0A=
## <fct> <int> <int> <dbl> =
<dbl>=0A=
## 1 0 17740 17740 2579549. 2579549.=0A=
## 2 1 1187 1187 230747. 230747.=0A=
## 3 2 698 698 158682. 158682.=0A=
## 4 3 170 170 45479. 45479.=0A=
## 5 4 2661 2661 502654. 502654.=0A=
## 6 5 4457 4457 995600. 995600.=0A=
## 7 6 2150 2150 505875. 505875.=0A=
## 8 7 2470 2468 541231. 539523.=0A=
## 9 8 4205 4205 849789. 849789.=0A=
## 10 9 1588 1588 351074. =
351074.