# > Storia_RM_RT$Test_20151231[Storia_RM_RT$Station_RT == 112]
# What do you expect to have happen when Station_RT is NA? R has no idea
# whether it is 112 or not, so R returns an "I don't know" value that
# lets the user decide how to handle the missing data, rather than making
# assumptions.
Again, sorry for my question
Stefano
________________________________________
Da: Sarah Goslee [sarah.gos...@gmail.com]
Inviato: mercoledì 7 settembre 2016 15.11
A: Stefano Sofia
Cc: r-help@r-project.org
Oggetto: Re: [R] how to manage missing values correctly when importing a data
frame
R is refusing to make unwarranted assumptions about your data.
See inline.
# it's nicer to use dput() instead of pasting raw data
Storia_RM_RT <- structure(list(Station_RM = c(1400L, 1460L, 1500L,
1520L), Sensor_RM = 2701:2704,
Place_RM = c("Novafeltria", "Carpegna", "Pesaro", "Fano"),
Y_init_RM = c(1959L, 1963L, 1957L, 1957L), M_init_RM = c(1L,
1L, 1L, 1L), D_init_RM = c(1L, 1L, 1L, 1L), Long_cent_RM = c(12.289552,
12.332614, 12.909822, 13.017591), Lat_cent_RM = c(43.890057,
43.778107, 43.910889, 43.840054), Height_RM = c(293L, 748L,
11L, 4L), Continues = c("NO", "SI", "SI", "SI"), Station_RT = c(NA,
702L, 112L, 152L), Sensor_RT = c(NA, 2954L, 1229L, 2671L),
Place_RT = c(NA, "Carpegna", "Pesaro", "Fano"), Name1_RT = c(NA,
"Carpegna", "Villa_Fastiggi", "Foce_Metauro"), Name2_RT = c(NA,
"Carpegna", "Villa_Fastiggi", "Metaurilia"), Long_cent_RT = c(NA,
12.340618, 12.86939, 13.053796), Lat_cent_RT = c(NA, 43.780575,
43.89061, 43.826328), Height_RT = c(NA, 715, 22, 7.12), Actual_net
= c("CAE",
"RT", "RT", "RT"), Notes = c(NA, NA, NA, NA), Test_20141231 = c("NO",
"NO", "YES", "YES"), Test_20151231 = c("NO", "NO", "YES",
"YES")), .Names = c("Station_RM", "Sensor_RM", "Place_RM",
"Y_init_RM", "M_init_RM", "D_init_RM", "Long_cent_RM", "Lat_cent_RM",
"Height_RM", "Continues", "Station_RT", "Sensor_RT", "Place_RT",
"Name1_RT", "Name2_RT", "Long_cent_RT", "Lat_cent_RT", "Height_RT",
"Actual_net", "Notes", "Test_20141231", "Test_20151231"), class =
"data.frame", row.names = c(NA,
-4L))
Storia_RM_RT$Test_20151231[Storia_RM_RT$Station_RM == 1500]
[1] "YES"
# Storia_RM_RT$Omogenea_20151231[Storia_RM_RT$Station_RT == 112]
# there's no such column; you probably mean Test_20151231
Storia_RM_RT$Test_20151231[Storia_RM_RT$Station_RT == 112]
[1] NA "YES"
# What do you expect to have happen when Station_RT is NA? R has no idea
# whether it is 112 or not, so R returns an "I don't know" value that
# lets the user decide how to handle the missing data, rather than making
# assumptions.
# But you probably want one of these constructions:
Storia_RM_RT$Test_20151231[Storia_RM_RT$Station_RT == 112 &
!is.na(Storia_RM_RT$Station_RT)]
# subset() treats an NA in the condition as FALSE, so those rows are dropped,
# which I'm assuming is the behaviour you want.
subset(Storia_RM_RT, Station_RT == 112 )$Test_20151231
# This is the first form again, written somewhat more elegantly
with(Storia_RM_RT, Test_20151231[Station_RT == 112 & !is.na(Station_RT)])
On Wed, Sep 7, 2016 at 7:09 AM, Stefano Sofia
<stefano.so...@regione.marche.it> wrote:
Dear R users,
I have a data frame with 22 columns, called Storia_RM_RT. Here are the first 4 rows:
Station_RM Sensor_RM Place_RM Y_init_RM M_init_RM D_init_RM Long_cent_RM
Lat_cent_RM Height_RM Continues Station_RT Sensor_RT Place_RT Name1_RT Name2_RT
Long_cent_RT Lat_cent_RT Height_RT Actual_net Notes Test_20141231 Test_20151231
1400 2701 Novafeltria 1959 1 1 12.289552 43.890057 293 NO NA NA NA NA NA NA NA
NA CAE NA NO NO
1460 2702 Carpegna 1963 1 1 12.332614 43.778107 748 SI 702 2954 Carpegna
Carpegna Carpegna 12.340618 43.780575 715 RT NA NO NO
1500 2703 Pesaro 1957 1 1 12.909822 43.910889 11 SI 112 1229 Pesaro
Villa_Fastiggi Villa_Fastiggi 12.86939 43.890610 22 RT NA YES YES
1520 2704 Fano 1957 1 1 13.017591 43.840054 4 SI 152 2671 Fano Foce_Metauro
Metaurilia 13.053796 43.826328 7.12 RT NA YES YES
I load it with
Storia_RM_RT <- read.table(file="Storia_RM_RT.txt", header = TRUE, sep=" ", dec =
".", stringsAsFactors = FALSE)
print(Storia_RM_RT$Test_20151231[Storia_RM_RT$Station_RM == 1500]) gives
[1] "YES"
while
print(Storia_RM_RT$Omogenea_20151231[Storia_RM_RT$Station_RT == 112]) gives
[1] NA "YES"
print(lapply(Storia_RM_RT, class)) gives
$Station_RM
[1] "integer"
$Sensor_RM
[1] "integer"
$Place_RM
[1] "character"
$Y_init_RM
[1] "integer"
$M_init_RM
[1] "integer"
$D_init_RM
[1] "integer"
$Long_cent_RM
[1] "numeric"
$Lat_cent_RM
[1] "numeric"
$Height_RM
[1] "integer"
$Continues
[1] "character"
$Station_RT
[1] "integer"
$Sensor_RT
[1] "integer"
$Place_RT
[1] "character"
$Name1_RT
[1] "character"
$Name2_RT
[1] "character"
$Long_cent_RT
[1] "numeric"
$Lat_cent_RT
[1] "numeric"
$Height_RT
[1] "numeric"
$Actual_net
[1] "character"
$Notes
[1] "logical"
$Test_20141231
[1] "character"
$Test_20151231
[1] "character"
I am struggling to understand why the query on the field Station_RT does
not work.
Could somebody please help me to handle the missing values correctly? Or is the
mistake somewhere else?
Thank you
Stefano Sofia
--
Sarah Goslee
http://www.functionaldiversity.org