2016-09-12 1 views
0

Ich versuche, Daten von einem USGS-Post-Formular mit RVST herunterladen. Was mache ich falsch?RVest Problem: POST-Übermittlung

make_url = function(base_url, parameter_list) 
    parameter_list %>% 
    names %>% 
    paste(parameter_list, sep = "=", collapse = "&") %>% 
    paste(base_url, ., sep = "") 

session = 
    list(sn = "01170000") %>% 
    make_url("http://ida.water.usgs.gov/ida/available_records.cfm?", .) %>% 
    html_session 

test = 
    session %>% 
    html_form %>% 
    .[[1]] %>% 
    set_values(fromdate = "1990-10-01") %>% 
    set_values(todate = "2007-09-30") %>% 
    set_values(rtype = "3") %>% 
    submit_form(session, .) 

Antwort

2

Keine Notwendigkeit für rvest oder eine Sitzung. Die folgende Funktion nimmt Station und Daten auf und gibt einen Datenrahmen mit dem Datendateikommentar zurück, den die USGS bei jedem Download ausspuckt.

Es verwendet die Option "komprimierte Datei herunterladen", um Bandbreite zu sparen und den Download zu beschleunigen. Es erstellt temporäre Dateien, um die Daten zu lesen, räumt jedoch auf. Spalten werden in den richtigen Typ konvertiert (Sie können diesen Teil des Codes jedoch weglassen, wenn Sie das möchten). Du kannst den Kommentar auch weglassen, wenn du ihn nicht brauchst (er schien mir nützliche Informationen zu haben).

readr::read_lines() wird für die Geschwindigkeit verwendet und Sie können stattdessen readLines() verwenden, wenn Sie sich nicht auf das readr-Paket verlassen möchten.

Die Umwandlung in eine tibble Version eines data.frame ist vor allem für eine bessere Druck aber es hat andere potenzielle Vorteile, so können Sie weglassen, dass auch wenn Sie nicht wollen, auf dem tibble Paket verlassen.

Es gibt eine hartcodierte 99-Sekunden-Zeitüberschreitung, aber Sie können sie bei Bedarf parametrieren.

library(httr) 
library(readr) 
library(tibble) 

#' Retrieve IDA Station Data 
#' 
#' @param site_no site id 
#' @param date_from records from date YYYY-mm-dd 
#' @param date_to records to date YYYY-mm-dd 
#' @return a parsed, type-converted data frame with a comments attribute. 
#' @example 
#' deerfield <- get_ida("01170000", "1990-10-01", "2007-09-30") 
#' 
#' head(deerfield) 
#' 
#' cat(comment(deerfield)) 

get_ida <- function(site_no, date_from, date_to) { 

    date_from_time <- sprintf("%s 00:15:00.0", date_from) 
    date_to_time <- sprintf("%s 23:45:00.0", date_to) 

    ida_referer <- sprintf("http://ida.water.usgs.gov/ida/available_records.cfm?sn=%s", site_no) 

    tf <- tempfile(".zip") 

    res <- POST(url = "http://ida.water.usgs.gov/ida/available_records_process.cfm", 
       body = list(fromdate = date_from, 
          todate = date_to, 
          mindatetime = date_from_time, 
          maxdatetime = date_to_time, 
          site_no = site_no, 
          rtype = "2", 
          submit1 = "Retrieve+Data"), 
       add_headers(Origin="http://ida.water.usgs.gov", 
          Referer=ida_referer), 
       write_disk(tf), 
       timeout(99), 
       encode = "form") 

    fils <- unzip(tf, exdir=tempdir()) 
    tmp <- read_lines(fils) 

    unlink(tf) 
    unlink(fils) 

    comments <- grep("^#", tmp, value=TRUE) 
    records <- grep("^#", tmp, value=TRUE, invert=TRUE) 
    header <- records[1:2] 
    records <- records[-(1:2)] 
    cols <- strsplit(header[1], "[[:space:]]+")[[1]] 

    comments <- paste0(comments, collapse="\n") 
    records <- paste0(records, collapse="\n") 

    df <- read_tsv(records, col_names=cols, "cccnnnnc") 
    df$date_time <- as.POSIXct(df$date_time, format="%Y%m%d%H%M%S") 
    df <- as_tibble(df) 

    comment(df) <- comments 

    df 

} 

Proof es funktioniert:

deerfield <- get_ida("01170000", "1990-10-01", "2007-09-30") 

dplyr::glimpse(deerfield) 
## Observations: 550,917 
## Variables: 8 
## $ site_no  <chr> "01170000", "01170000", "01170000", "01170000", "0117000... 
## $ date_time <time> 1990-10-01 00:15:00, 1990-10-01 00:30:00, 1990-10-01 00... 
## $ tz_cd  <chr> "EDT", "EDT", "EDT", "EDT", "EDT", "EDT", "EDT", "EDT", ... 
## $ dd   <dbl> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,... 
## $ accuracy_cd <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,... 
## $ value  <dbl> 146, 139, 135, 143, 154, 166, 171, 175, 171, 166, 162, 1... 
## $ prec  <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,... 
## $ remark  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ... 

head(deerfield) 
## # A tibble: 6 x 8 
## site_no   date_time tz_cd dd accuracy_cd value prec remark 
##  <chr>    <time> <chr> <dbl>  <dbl> <dbl> <dbl> <chr> 
## 1 01170000 1990-10-01 00:15:00 EDT  7   1 146  3 <NA> 
## 2 01170000 1990-10-01 00:30:00 EDT  7   1 139  3 <NA> 
## 3 01170000 1990-10-01 00:45:00 EDT  7   1 135  3 <NA> 
## 4 01170000 1990-10-01 01:00:00 EDT  7   1 143  3 <NA> 
## 5 01170000 1990-10-01 01:15:00 EDT  7   1 154  3 <NA> 
## 6 01170000 1990-10-01 01:30:00 EDT  7   1 166  3 <NA> 

cat(comment(deerfield)) 
# retrieved: 2016-09-12 05:32:34 CST 
# 
# Data for the following station is contained in this file 
# --------------------------------------------------------- 
# USGS 01170000 DEERFIELD RIVER NEAR WEST DEERFIELD, MA 
# 
# This data file was retrieved from the USGS 
# instantaneous data archive at 
# http://ida.water.usgs.gov 
# 
# ---------------------WARNING--------------------- 
# The instantaneous data you have obtained from 
# this automated U.S. Geological Survey database 
# may or may not have been the basis for the published 
# daily mean discharges for this station. Although 
# automated filtering has been used to compare these 
# data to the published daily mean values and to remove 
# obviously bad data, there may still be significant 
# error in individual values. Users are strongly 
# encouraged to review all data carefully prior to use. 
# These data are released on the condition that neither 
# the USGS nor the United States Government may be held 
# liable for any damages resulting from its use. 
# 
# This file consists of tab-separated columns of the 
# following fields. 
# 
# column  column definition 
# ----------- ----------------------------------------- 
# site_no  USGS site identification number 
# date_time  date and time in format (YYYYMMDDhhmmss) 
# tz_cd  time zone 
# dd   internal USGS sensor designation (''data descriptor'') 
# accuracy_cd accuracy code 
#     0 - A daily mean discharge calculated from the instantaneous 
#      data on this day is 0.01 cubic feet per second 
#      or less and the published daily mean is zero. 
#     1 - A daily mean discharge calculated from the instantaneous 
#      data on this day matches the published daily mean 
#      within 1 percent. 
#     2 - A daily mean discharge calculated from the instantaneous 
#      data on this day matches the published daily mean 
#      from greater than 1 to 5 percent. 
#     3 - A daily mean discharge calculated from the instantaneous 
#      values on this day matches the published daily mean 
#      from greater than 5 to 10 percent. 
#     9 - The instantaneous value is considered correct by the 
#      collecting USGS Water Science Center. A published daily 
#      mean value does not exist and/or no comparison was made. 
# value  discharge in cubic feet per second 
# precision digits of precision in the discharge 
# remark  optional remark code 
#     Remark Explanation 
#     <  Actual value is known to be less than reported value. 
#     >  Actual value is known to be greater than reported value. 
#     &  Value is affected by unspecified reasons. 
#     A  Value is affected by ice at the measurement site. 
#     B  Value is affected by backwater at the measurement site. 
#     e  Value has been estimated by USGS personnel. 
#     E  Value was computed from an estimated value. 
#     F  Value was modified due to automated filtering. 
#     K  Value is affected by instrument calibration drift. 
#     R  Rating is undefined for this value. 
# 
# 
+0

Große Antwort! Wirklich über und über. Es scheint, als ob es in einem Paket nett sein könnte? Ich würde immer noch gerne sehen, ob es eine Möglichkeit gibt, Geld zu verdienen. Die hadleyverse Syntax ist so nett. – bramtayl

+0

Nun, 'httr' _technisch_ ist Teil des hadleyverse ;-) Ich dachte über die gleiche Sache w/r/ta Potenzial pkg, aber die IDA-Website sagt, dass alle Daten schließlich auf die NWIS-Website bewegt und es gibt bereits eine R pkg zu Greife auf diese Daten zu - https://cran.r-project.org/web/packages/waterData/waterData.pdf - aber wenn sie das Archiv alleine halten, wäre es etwas, worüber ich nachdenken würde. – hrbrmstr

+0

Es ist erwähnenswert, dass diese Daten seit mindestens zwei Jahren von der Haupt-USGS-Site getrennt sind. – bramtayl

0

Ok, hier ist ein Weg, rvest an der Arbeit:

library(magrittr) 

make_url = function(base_url, parameter_list = list(), ...) { 
    together_list = 
    parameter_list %>% 
    c(list(...)) 

    together_list %>% 
    names %>% 
    paste(together_list, sep = "=", collapse = "&") %>% 
    paste(base_url, ., sep = "?") 
} 

download_ida = function(site_no, 
         fromdate = "1990-10-01", 
         todate = "2007-09-30", 
         dir = ".", 
         filename = paste(site_no, "txt", sep = ".")) { 

    session = 
    "http://ida.water.usgs.gov/ida/available_records.cfm" %>% 
    make_url(sn = "01170000") %>% 
    html_session 

    form = 
    session %>% 
    html_form %>% 
    .[[1]] %>% 
    set_values(fromdate = fromdate, 
       todate = todate, 
       rtype = "2") 

    tempfile = tempfile(".zip") 

    submit_form(session, form, submit = NULL, 
       httr::write_disk(tempfile, 
           overwrite = TRUE), 
       httr::add_headers(Referer = session$url)) 

    filename = file.path(dir, filename) 

    tempfile %>% 
    unzip(exdir = dir) %>% 
    file.rename(filename) 

    filename 
} 

read_ida = function(filename) { 

    col_names = 
    filename %>% 
    readr::read_tsv(comment = "#", n_max = 1, col_names = FALSE) 

    filename %>% 
    readr::read_tsv(comment = "#", skip= 2, col_names = FALSE, na = "Ice", 
        col_types = cols(X2 = col_datetime(format = "%Y%m%d%H%M%S"))) %>% 
    stats::setNames(col_names) 
} 

deerfield = 
    "01170000" %>% 
    download_ida %>% 
    read_ida 

Aber es gibt eine Einschränkung: rvest derzeit eine offene Pull-Anforderung hat, https://github.com/hadley/rvest/pull/161 , die benötigt wird, um dies zur Arbeit zu bringen. Zu diesem Zweck ist es notwendig, sowohl submit_request und submit_form Integration der neuen Pull-Anforderung neu zu definieren:

submit_request = function(form, submit = NULL) { 
    is_submit <- function(x) 
    if (is.null(x$type)) FALSE else 
     tolower(x$type) %in% c("submit", "image", "button") 

    submits <- Filter(is_submit, form$fields) 

    if (length(submits) == 0) { 
    stop("Could not find possible submission target.", call. = FALSE) 
    } 
    if (is.null(submit)) { 
    submit <- names(submits)[[1]] 
    message("Submitting with '", submit, "'") 
    } 
    if (!(submit %in% names(submits))) { 
    stop("Unknown submission name '", submit, "'.\n", "Possible values: ", 
     paste0(names(submits), collapse = ", "), call. = FALSE) 
    } 
    other_submits <- setdiff(names(submits), submit) 
    method <- form$method 
    if (!(method %in% c("POST", "GET"))) { 
    warning("Invalid method (", method, "), defaulting to GET", 
      call. = FALSE) 
    method <- "GET" 
    } 
    url <- form$url 
    fields <- form$fields 
    fields <- Filter(function(x) length(x$value) > 0, fields) 
    fields <- fields[setdiff(names(fields), other_submits)] 
    values <- pluck(fields, "value") 
    names(values) <- names(fields) 
    list(method = method, encode = form$enctype, url = url, values = values) 
} 

submit_form = function(session, form, submit = NULL, ...) { 
    request <- submit_request(form, submit) 
    url <- xml2::url_absolute(form$url, session$url) 
    if (request$method == "GET") { 
    rvest:::request_GET(session, url = url, query = request$values, ...) 
    } else if (request$method == "POST") { 
    rvest:::request_POST(session, url = url, body = request$values, 
         encode = request$encode, ...) 
    } else { 
    stop("Unknown method: ", request$method, call. = FALSE) 
    } 
} 

Hoffentlich wird die Pull-Anforderung bald verschmolzen bekommen.