library(XML)

# Directory holding the example KML files. The trailing slash matters
# because the file name is appended to it directly.
dir = "~/Books/XMLTechnologies/XSL/examples/Earthquakes/"

# Parse the 2004 earthquake KML file; `doc` is the parsed document that the
# XPath queries below operate on.
doc = xmlParse(sprintf("%s%s", dir, "2004_Earthquakes_ALL.kml"))
We get all the <Placemark> nodes under the <Folder> nodes.
# Collect every <Placemark> element that is a direct child of a <Folder>.
# The single string "kml" is passed as the namespaces argument so the
# "kml:" prefix can be used in the XPath expression.
pl = getNodeSet(doc,
                path = "//kml:Folder/kml:Placemark",
                namespaces = "kml")
Now we extract the longitude and latitude, the magnitude, etc.
Before we do this, let's just test one thing. Which is faster: getting the <Placemark> nodes once and then working on these in R to extract their children, or using XPath many times to get each of the different elements we want (<coordinates>, <description>, <name>) within each <Placemark> in the document? Let's time these.
# Approach 1: loop over the <Placemark> nodes in R and extract each
# node's <description> child by name.
system.time({
  desc = sapply(pl, function(x) xmlValue(x[["description"]]))
})
# Recorded output:
#    user  system elapsed
#  18.411   0.972  35.986

# Approach 2: one XPath query over the whole document that goes straight
# to the <description> elements.
system.time({
  desc1 = xpathSApply(doc, "//kml:Folder/kml:Placemark/kml:description",
                      xmlValue, namespaces = "kml")
})
# Recorded output:
#    user  system elapsed
#   0.711   0.052   1.262

# Confirm that the two approaches yield the same values.
all(desc1 == desc)
So it is much faster (in this context) to use XPath many times rather than using R to access the children by name. Accessing by index may be different.
# Same R-level loop as before, but taking the second child of each
# <Placemark> by position instead of looking it up by name.
# NOTE(review): assumes the second child is the <description> element -
# confirm against the KML file.
system.time({
  desc = sapply(pl, function(x) xmlValue(x[[2]]))
})
# Recorded output:
#    user  system elapsed
#  10.201   0.697  17.039
# Text of every <name> element under a <Placemark>.
nm = xpathSApply(doc, "//kml:Folder/kml:Placemark/kml:name", xmlValue,
                 namespaces = "kml")
# nm = sapply(pl, function(x) xmlValue(x[["name"]]))

# The pattern matches names of the form "M <value> - ..." where <value> is
# either a number or the literal "None"; keep just that value.
mags = gsub("^M (None|[0-9.]+) -.*", "\\1", nm)

# Treat a missing magnitude ("None") as zero so the conversion to numeric
# does not produce NA for it.
mags[mags == "None"] = "0"
magnitude = as.numeric(mags)

# Check the results
sum(is.na(magnitude)) == 0
table(floor(magnitude))
Now get the depth from the HTML content in the description.
# Text of every <description> element under a <Placemark>; each value is
# itself parsed as HTML below.
desc = xpathSApply(doc, "//kml:Folder/kml:Placemark/kml:description",
                   xmlValue, namespaces = "kml")
# desc = sapply(pl, function(x) xmlValue(x[["description"]]))

# Parse one description string as HTML and return the text of the first
# node matching //tr[2]/td/font.
#
# x: a single character string containing HTML.
# Returns a character string, or NA_character_ when no node matches
# (instead of failing with "subscript out of bounds").
getDepthStringFromDesc = function(x) {
  hdoc = htmlParse(x, asText = TRUE)
  nodes = getNodeSet(hdoc, "//tr[2]/td/font")
  if(length(nodes) == 0)
    return(NA_character_)
  xmlValue(nodes[[1]])
}

# vapply() guarantees a character vector whatever the input length.
str = vapply(desc, getDepthStringFromDesc, character(1))

# Drop everything from the first space onwards and convert what is left to
# a number; as.numeric() also strips the names carried over from desc.
depth = as.numeric(gsub(" .*", "", str))
sum(is.na(depth))
Let's get the date
Now the longitude and latitude
Now we assemble the data frame
# Read earthquake data from a KML/KMZ source.
#
# doc: either an already parsed XML document, or a character string. A
#      string matching "^http.*kmz$" is fetched and unpacked with
#      getKMZURL(); any other string is handed directly to xmlParse().
#
# Returns `eq`, which is presumably created by the <<assemble>> chunk -
# confirm against that chunk's definition.
readKMZEarthquakes = function(doc) {
  if(is.character(doc)) {
    if(length(grep("^http.*kmz$", doc)) > 0) {
      # getKMZURL() already returns a parsed document (readXML = TRUE by
      # default), so its result must not be passed through xmlParse() again.
      doc = getKMZURL(doc)
    } else
      doc = xmlParse(doc)
  }
  cat("parsed the XML file\n")

  # The chunk references below are expanded by the literate-programming
  # tool into the code chunks of the same names.
  <<magnitude>>
  <<dates>>
  <<description>>
  <<coordinates>>
  <<assemble>>

  eq
}
# Download a KMZ (zipped KML) file and return its contents.
#
# doc:     URL of the .kmz file. Defaults to the 2009 USGS earthquake feed,
#          which was previously hard-coded (the argument used to be ignored).
# readXML: if TRUE, parse the first entry of the archive as XML and return
#          the parsed document invisibly; if FALSE, return the archive
#          object itself.
getKMZURL = function(doc = 'http://neic.usgs.gov/neis/epic/kml/2009_Earthquakes_ALL.kmz',
                     readXML = TRUE) {
  # Fetch the URL the caller asked for rather than a fixed one.
  kmz = getURLContent(doc)
  ar = zipArchive(kmz)
  if(readXML) {
    # NOTE(review): assumes the first archive entry is the KML document and
    # that ar[[1]] yields its text - confirm against zipArchive's behaviour.
    invisible(xmlParse(ar[[1]], asText = TRUE))
  } else
    ar
}