2017-11-23 6 views
0

Ich habe eine JSON-Datei 'data.json', die Informationen über verschiedene Orte von Interesse enthält. Thema: Konvertierung von verschachteltem JSON ungleicher Länge in einen Datenrahmen in R. Ich lese die Datei zeilenweise ein:

# Read the file line by line and parse each line as its own JSON document,
# producing one list element per line of "data.json".
# NOTE(review): fromJSON is presumably jsonlite::fromJSON -- confirm which
# JSON package is attached before running.
data <- lapply(readLines("data.json"), fromJSON)

Dies erzeugt eine verschachtelte Liste mit unterschiedlichen Längen. Hier ist ein Beispiel für die ersten 4 Zeilen.

list(structure(list(payload = structure(list(existence_full = 1L, 
    geo_virtual = "[\"56.9459720|-2.1971226|20|within_50m|4\"]", 
    latitude = "56.945972", locality = "Stonehaven", `_records_touched` = "{\"crawl\":8,\"lssi\":0,\"polygon_centroid\":0,\"geocoder\":0,\"user_submission\":0,\"tdc\":0,\"gov\":0}", 
    address = "The Lodge, Dunottar", email = "[email protected]", 
    existence_ml = 0.569423821765872, domain_aggregate = "", 
    name = "Dunnottar Castle", search_tags = c("Dunnottar Castle Aberdeenshire", 
    "Dunotter Castle"), admin_region = "Scotland", existence = 1L, 
    category_labels = structure(c("Landmarks", "Buildings and Structures" 
    ), .Dim = 1:2), post_town = "Stonehaven", region = "Kincardineshire", 
    review_count = "719", geocode_level = "within_50m", tel = "01569 762173", 
    placerank = 65L, longitude = "-2.197123", placerank_ml = 37.2791607346447, 
    fax = "01330 860325", category_ids_text_search = "", website = "http://www.dunnottarcastle.co.uk", 
    status = "1", geocode_confidence = "20", postcode = "AB39 2TL", 
    category_ids = 108L, country = "gb", `_geocode_quality` = "4"), .Names = c("existence_full", 
"geo_virtual", "latitude", "locality", "_records_touched", "address", 
"email", "existence_ml", "domain_aggregate", "name", "search_tags", 
"admin_region", "existence", "category_labels", "post_town", 
"region", "review_count", "geocode_level", "tel", "placerank", 
"longitude", "placerank_ml", "fax", "category_ids_text_search", 
"website", "status", "geocode_confidence", "postcode", "category_ids", 
"country", "_geocode_quality")), uuid = "3867aaf3-12ab-434f-b12b-5d627b3359c3"), .Names = c("payload", 
"uuid")), structure(list(payload = structure(list(existence_full = 1L, 
    geo_virtual = "[\"56.237480|-5.073578|20|within_50m|4\"]", 
    latitude = "56.237480", locality = "Inveraray", `_records_touched` = "{\"crawl\":11,\"lssi\":0,\"polygon_centroid\":0,\"geocoder\":0,\"user_submission\":0,\"tdc\":0,\"gov\":0}", 
    address = "Cherry Park", email = "[email protected]", 
    longitude = "-5.073578", domain_aggregate = "", name = "Inveraray Castle", 
    admin_region = "Scotland", search_tags = c("Inveraray Castle Tea Room", 
    "Inverary Castle"), existence = 1L, category_labels = structure(c("Social", 
    "Food and Dining", "Restaurants"), .Dim = c(1L, 3L)), region = "Argyll", 
    review_count = "532", geocode_level = "within_50m", tel = "01499 302203", 
    placerank = 67L, post_town = "Inveraray", placerank_ml = 41.1997808735227, 
    fax = "01499 302421", category_ids_text_search = "", website = "http://www.inveraray-castle.com", 
    status = "1", geocode_confidence = "20", postcode = "PA32 8XE", 
    category_ids = 347L, country = "gb", `_geocode_quality` = "4", 
    existence_ml = 0.791488110284778), .Names = c("existence_full", 
"geo_virtual", "latitude", "locality", "_records_touched", "address", 
"email", "longitude", "domain_aggregate", "name", "admin_region", 
"search_tags", "existence", "category_labels", "region", "review_count", 
"geocode_level", "tel", "placerank", "post_town", "placerank_ml", 
"fax", "category_ids_text_search", "website", "status", "geocode_confidence", 
"postcode", "category_ids", "country", "_geocode_quality", "existence_ml" 
)), uuid = "8278ab80-2cd1-4dbd-9685-0d0036b681eb"), .Names = c("payload", 
"uuid")), structure(list(payload = structure(list(existence_full = 1L, 
    geo_virtual = "[\"51.483872|-0.606820|100|rooftop|2\"]", 
    latitude = "51.483872", locality = "Windsor Castle", hours_display = "Mon-Sat 11:30 AM-11:00 PM; Sun 12:00 PM-11:00 PM", 
    `_records_touched` = "{\"crawl\":7,\"lssi\":0,\"polygon_centroid\":0,\"geocoder\":2,\"user_submission\":0,\"tdc\":0,\"gov\":0}", 
    address = "", longitude = "-0.606820", domain_aggregate = "", 
    name = "Windsor Castle", admin_region = "England", search_tags = c("The Windsor Castle", 
    "The Windsor Castle Pub", "The Windsor Castle Public House", 
    "Pub Food", "British"), existence = 1L, category_labels = structure(c("Landmarks", 
    "Buildings and Structures"), .Dim = 1:2), region = "Berkshire", 
    review_count = "", geocode_level = "rooftop", tel = "020 7766 7304", 
    placerank = 62L, post_town = "Windsor", placerank_ml = 28.1160845346327, 
    fax = "01753 832290", category_ids_text_search = "", website = "http://www.royalcollection.org.uk/visit/windsorcastle", 
    status = "1", hours = "{\"monday\":[[\"11:30\",\"23:00\"]],\"tuesday\":[[\"11:30\",\"23:00\"]],\"wednesday\":[[\"11:30\",\"23:00\"]],\"thursday\":[[\"11:30\",\"23:00\"]],\"friday\":[[\"11:30\",\"23:00\"]],\"saturday\":[[\"11:30\",\"23:00\"]],\"sunday\":[[\"12:00\",\"23:00\"]]}", 
    neighborhood = "Chalvey", geocode_confidence = "100", postcode = "SL4 1NJ", 
    category_ids = 108L, country = "gb", `_geocode_quality` = "2", 
    existence_ml = 0.885705196944165, email = "[email protected]"), .Names = c("existence_full", 
"geo_virtual", "latitude", "locality", "hours_display", "_records_touched", 
"address", "longitude", "domain_aggregate", "name", "admin_region", 
"search_tags", "existence", "category_labels", "region", "review_count", 
"geocode_level", "tel", "placerank", "post_town", "placerank_ml", 
"fax", "category_ids_text_search", "website", "status", "hours", 
"neighborhood", "geocode_confidence", "postcode", "category_ids", 
"country", "_geocode_quality", "existence_ml", "email")), uuid = "c5f7d8a9-0851-46ef-8da7-ad55e187d3a8"), .Names = c("payload", 
"uuid")), structure(list(payload = structure(list(existence_full = 1L, 
    category_ids_text_search = "", placerank_ml = 31.9857184762157, 
    longitude = "-2.191955", name = "Pitmedden Garden", domain_aggregate = "", 
    admin_region = "Scotland", languages = "English", region = "Aberdeenshire", 
    review_count = "2", geocode_level = "rooftop", tel = "01651 842352", 
    placerank = 57L, post_town = "Ellon", category_labels = structure(c("Landmarks", 
    "Gardens"), .Dim = 1:2), existence = 1L, fax = "0844 493 2102", 
    website = "http://www.nts.org.uk/Property/Pitmedden-Garden", 
    status = "1", geocode_confidence = "100", postcode = "AB41 7PD", 
    country = "gb", category_ids = 109L, `_geocode_quality` = "4", 
    existence_ml = 0.849871115334588, email = "[email protected]", 
    address = "", `_records_touched` = "{\"crawl\":6,\"lssi\":0,\"polygon_centroid\":0,\"geocoder\":0,\"user_submission\":0,\"tdc\":0,\"gov\":0}", 
    locality = "Pitmedden", latitude = "57.343233", geo_virtual = "[\"57.343233|-2.191955|100|rooftop|4\"]"), .Names = c("existence_full", 
"category_ids_text_search", "placerank_ml", "longitude", "name", 
"domain_aggregate", "admin_region", "languages", "region", "review_count", 
"geocode_level", "tel", "placerank", "post_town", "category_labels", 
"existence", "fax", "website", "status", "geocode_confidence", 
"postcode", "country", "category_ids", "_geocode_quality", "existence_ml", 
"email", "address", "_records_touched", "locality", "latitude", 
"geo_virtual")), uuid = "bb57a153-740f-42be-aa4d-ae12d4eb57d4"), .Names = c("payload", 
"uuid"))) 

Ich möchte dies in einen Datenrahmen konvertieren, in dem die Werte aus der Liste von Listen auf die jeweiligen Spalten verteilt werden. Jede innere Liste enthält Informationen zu einem bestimmten Ort, der durch seine uuid identifiziert wird. Jede Zeile des Datenrahmens soll also die Informationen zu genau einer uuid enthalten. In Spalten, für die ein Ort keinen Wert hat, soll NA stehen.

Ich habe einige der Ansätze ausprobiert, die in ähnlichen Fragen genannt werden, war damit aber nicht erfolgreich.

Für jede Anregung wäre ich sehr dankbar! Danke

Antwort

0

Eine ausführlichere Beschreibung des ursprünglichen Datenlayouts wäre hilfreich gewesen, aber hier ist ein Versuch, basierend auf dem, was ich als übergeordnete Struktur des Objekts erkenne. Angenommen, die oben gezeigte Struktur ist unter dem Namen dat gespeichert.

> lapply(dat, names) 
[[1]] 
[1] "payload" "uuid" 

[[2]] 
[1] "payload" "uuid" 

[[3]] 
[1] "payload" "uuid" 

[[4]] 
[1] "payload" "uuid" 

So extrahieren Sie die beiden Komponenten jeweils in eine Liste von Datenrahmen:

# Build one data frame per record from its `payload` list. As the dim()
# output further down shows, records whose fields hold several values
# (e.g. `search_tags`) come out as multi-row data frames.
payloads <- lapply(dat, function(x) data.frame(x$payload)) 
# Wrap each record's `uuid` in a one-row, one-column data frame. The column
# name ("x.uuid") is derived from the expression `x$uuid`, so renaming `x`
# would change the resulting column name.
uuids <- lapply(dat, function(x) data.frame(x$uuid)) 

Dann binden Sie sie „Seite an Seite“ zusammen:

# Column-bind each payload data frame with its matching uuid data frame;
# cbind() recycles the single uuid row to the payload's row count.
# Map() always returns a plain list. mapply() with its default
# SIMPLIFY = TRUE would instead collapse the results into a list-matrix
# whenever every combined data frame happened to have the same number of
# columns, which would break any later step expecting a list of data frames.
newdat <- Map(cbind, payloads, uuids)

Schauen Sie sich die Dimensionen an und prüfen Sie, ob die Daten der einzeiligen Datenrahmen (uuid) korrekt über die mehrzeiligen Datenrahmen (payload) vervielfältigt wurden. Das Einzige, was noch nicht Ihren Vorgaben entspricht, sind die NAs:

> lapply(payloads, dim) 
[[1]] 
[1] 2 32 

[[2]] 
[1] 2 33 

[[3]] 
[1] 5 35 

[[4]] 
[1] 1 32 

> lapply(uuids, dim) 
[[1]] 
[1] 1 1 

[[2]] 
[1] 1 1 

[[3]] 
[1] 1 1 

[[4]] 
[1] 1 1 

> lapply(mapply(cbind, payloads, uuids), dim) 
[[1]] 
[1] 2 33 

[[2]] 
[1] 2 34 

[[3]] 
[1] 5 36 

[[4]] 
[1] 1 33 

Die nächste Stufe der Konsolidierung könnte darin bestehen, die Datenrahmen „untereinander“ zusammenzufügen, da ihre Spaltennamen so ähnlich sind. Da die ‚uuid‘ offenbar Kennungen sind, haben die cbind-Operationen deren Spalteninhalt jeweils auf die Zeilenzahl des zugehörigen ‚payload‘-Datenrahmens vervielfältigt:

# Install plyr only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages("plyr")
}
# Stack all per-record data frames into one; rbind.fill() fills columns that
# a record lacks with NA instead of failing on mismatched column sets.
newdat3 <- do.call(plyr::rbind.fill, newdat)
newdat3
Die Spaltennamen der einzelnen Datenrahmen:

lapply(newdat, names) 
[[1]] 
[1] "existence_full"   "geo_virtual"    "latitude"     
[4] "locality"     "X_records_touched"  "address"     
[7] "email"     "existence_ml"    "domain_aggregate"   
[10] "name"      "search_tags"    "admin_region"    
[13] "existence"    "category_labels.1"  "category_labels.2"  
[16] "post_town"    "region"     "review_count"    
[19] "geocode_level"   "tel"      "placerank"    
[22] "longitude"    "placerank_ml"    "fax"      
[25] "category_ids_text_search" "website"     "status"     
[28] "geocode_confidence"  "postcode"     "category_ids"    
[31] "country"     "X_geocode_quality"  "x.uuid"     

[[2]] 
[1] "existence_full"   "geo_virtual"    "latitude"     
[4] "locality"     "X_records_touched"  "address"     
[7] "email"     "longitude"    "domain_aggregate"   
[10] "name"      "admin_region"    "search_tags"    
[13] "existence"    "category_labels.1"  "category_labels.2"  
[16] "category_labels.3"  "region"     "review_count"    
[19] "geocode_level"   "tel"      "placerank"    
[22] "post_town"    "placerank_ml"    "fax"      
[25] "category_ids_text_search" "website"     "status"     
[28] "geocode_confidence"  "postcode"     "category_ids"    
[31] "country"     "X_geocode_quality"  "existence_ml"    
[34] "x.uuid"     

[[3]] 
[1] "existence_full"   "geo_virtual"    "latitude"     
[4] "locality"     "hours_display"   "X_records_touched"  
[7] "address"     "longitude"    "domain_aggregate"   
[10] "name"      "admin_region"    "search_tags"    
[13] "existence"    "category_labels.1"  "category_labels.2"  
[16] "region"     "review_count"    "geocode_level"   
[19] "tel"      "placerank"    "post_town"    
[22] "placerank_ml"    "fax"      "category_ids_text_search" 
[25] "website"     "status"     "hours"     
[28] "neighborhood"    "geocode_confidence"  "postcode"     
[31] "category_ids"    "country"     "X_geocode_quality"  
[34] "existence_ml"    "email"     "x.uuid"     

[[4]] 
[1] "existence_full"   "category_ids_text_search" "placerank_ml"    
[4] "longitude"    "name"      "domain_aggregate"   
[7] "admin_region"    "languages"    "region"     
[10] "review_count"    "geocode_level"   "tel"      
[13] "placerank"    "post_town"    "category_labels.1"  
[16] "category_labels.2"  "existence"    "fax"      
[19] "website"     "status"     "geocode_confidence"  
[22] "postcode"     "country"     "category_ids"    
[25] "X_geocode_quality"  "existence_ml"    "email"     
[28] "address"     "X_records_touched"  "locality"     
[31] "latitude"     "geo_virtual"    "x.uuid"  

Die Funktion rbind.fill aus Hadleys plyr-Paket kann dies effizient erledigen.

Wenn man sich einige der Spalten ansieht, scheint dies Ihre Anforderungen erfüllt zu haben:

> newdat3[ , c("locality", "category_labels.3", "neighborhood")] 
     locality category_labels.3 neighborhood 
1  Stonehaven    <NA>   <NA> 
2  Stonehaven    <NA>   <NA> 
3  Inveraray  Restaurants   <NA> 
4  Inveraray  Restaurants   <NA> 
5 Windsor Castle    <NA>  Chalvey 
6 Windsor Castle    <NA>  Chalvey 
7 Windsor Castle    <NA>  Chalvey 
8 Windsor Castle    <NA>  Chalvey 
9 Windsor Castle    <NA>  Chalvey 
10  Pitmedden    <NA>   <NA> 
Verwandte Themen