Hello, 

 

I am trying to read in elements out of a very large binary file ... the
total file is 4 gigs. I want to select rows out of the file, and the
current procedure I run works but is prohibitively slow (takes more than
a day to run and still won't complete). Is there any faster way to
accomplish this?

 

My current procedure looks like this:

 

readHH <- function(file_name, hhid_list) {

incon=file(file_name, open="rb")

result=data.frame()

tran=list()

byte_mark=0

last_1M_mod=0

file_size=file.info(file_name)$size

write.table(paste("Data pulled from", file_name, sep=" "),
file="readHH_output.txt", sep=",", row.names=FALSE, col.names=FALSE,
append=TRUE)

while (TRUE) {

    tran$hh_id <- readBin(incon,integer(),1,size=4)

    if(is.element(tran$hh_id, hhid_list)) {

        tran$prov_id <- readBin(incon,integer(),1,size=2)

        tran$txn_dn <- readBin(incon,integer(),1,size=2)

        tran$total_dollars <- readBin(incon,integer(),1,size=4)

        tran$total_items <- readBin(incon,integer(),1,size=4)

        tran$order_id <- readBin(incon,integer(),1,size=4)

        tran$txn_type <- readChar(incon,1)

        tran$gender <- readChar(incon,1)

        tran$zip_code <- readChar(incon,5)

        tran$region_code <- readChar(incon,1)

        tran$county_code <- readChar(incon,1)

        tran$state_abbrev <- readChar(incon,2)

        tran$channel_code <- readChar(incon,1)

        tran$source_code <- readChar(incon,20)

        tran$payment_type <- readChar(incon,1)

        tran$credit_card <- readChar(incon,1)

        tran$promo_type <- readChar(incon,1)

        tran$flags <- readChar(incon,1)

        write.table(data.frame(tran), file="readHH_output", sep=",",
row.names=FALSE, col.names=FALSE, append=TRUE)

        result <- rbind(result,data.frame(tran))

        }

    else {

        byte_mark=byte_mark+42

        if (byte_mark>=file_size) {break}

        else {seek(incon, where=byte_mark)}

        }

    }

return(result)

}

 

Thanks

 

Matt

 

 

 

 

Matt Anthony | Senior Statistician| 303.327.1761 |
[EMAIL PROTECTED]
10155 Westmoor Drive | Westminster, CO 80021 | FAX 303.327.1650

 

 


        [[alternative HTML version deleted]]

______________________________________________
R-help@stat.math.ethz.ch mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.

Reply via email to