library(XML)
# Directory holding the example KML file (note the trailing slash) and the
# parsed 2004 earthquake document that the rest of the script works on.
dir <- "~/Books/XMLTechnologies/XSL/examples/Earthquakes/"
doc <- xmlParse(paste0(dir, "2004_Earthquakes_ALL.kml"))
We get all the <Placemark> nodes under the <Folder> nodes.
# Every <Placemark> that is a direct child of a <Folder>; "kml" is passed as
# the namespaces argument so the kml: prefix in the XPath can be resolved.
pl <- getNodeSet(doc, path = "//kml:Folder/kml:Placemark", namespaces = "kml")
Now we extract the longitude and latitude, the magnitude, etc.
Before we do this, let's just test one thing. Which is faster: getting the <Placemark> nodes once and then working on these in R to extract their children, or using XPath many times to get each of the different elements we want from the document (<coordinates>, <description>, <name>) within each <Placemark>? Let's time these.
# Approach 1: walk the Placemark nodes in R and pull out the <description>
# child of each one by name.
system.time({
  desc <- sapply(pl, function(node) xmlValue(node[["description"]]))
})
user system elapsed
18.411 0.972 35.986
# Approach 2: let a single XPath query find every <description> node and
# apply xmlValue() to each match.
system.time({
  desc1 <- xpathSApply(doc,
                       path = "//kml:Folder/kml:Placemark/kml:description",
                       fun = xmlValue,
                       namespaces = "kml")
})
user system elapsed
0.711 0.052 1.262
# Both approaches should yield the same description strings.
all(desc == desc1)
So it is much faster (in this context) to use XPath many times rather than using R to access the children by name. Accessing by index may be different.
# Approach 1 again, but taking the second child of each Placemark by
# position instead of looking it up by name.
system.time({
  desc <- sapply(pl, function(node) xmlValue(node[[2]]))
})
user system elapsed
10.201 0.697 17.039
# The <name> of each Placemark has the form "M <magnitude> - ...", where the
# magnitude is either a number or the literal "None".
nm <- xpathSApply(doc,
                  path = "//kml:Folder/kml:Placemark/kml:name",
                  fun = xmlValue,
                  namespaces = "kml")
# Alternative using the node list:
#nm = sapply(pl, function(x) xmlValue(x[["name"]]))

# Keep only the magnitude field, treat "None" as 0 and convert to numbers.
mags <- gsub(pattern = "^M (None|[0-9.]+) -.*", replacement = "\\1", x = nm)
mags <- replace(mags, mags == "None", "0")
magnitude <- as.numeric(mags)

# Check the results: TRUE when every name yielded a number.
!any(is.na(magnitude))
table(floor(magnitude))
Now get the depth from the HTML content in the description.
# Text content of every Placemark's <description>, fetched with one XPath
# query.
desc <- xpathSApply(doc,
                    path = "//kml:Folder/kml:Placemark/kml:description",
                    fun = xmlValue,
                    namespaces = "kml")
# Alternative using the node list:
# desc = sapply(pl, function(x) xmlValue(x[["description"]]))
# Extract the depth text from the HTML held in one Placemark description.
#
# x: a character string containing HTML markup.
# Value: the text of the first node matching //tr[2]/td/font in the parsed
#   HTML, or NA_character_ when no node matches. Returning NA (rather than
#   failing with "subscript out of bounds") lets a caller apply this function
#   to every description and count the failures afterwards.
getDepthStringFromDesc <-
function(x)
{
  hdoc <- htmlParse(x, asText = TRUE)
  nodes <- getNodeSet(hdoc, "//tr[2]/td/font")
  if (length(nodes) == 0)
    return(NA_character_)
  xmlValue(nodes[[1]])
}
# One depth string per description.
str <- sapply(X = desc, FUN = getDepthStringFromDesc)
# Drop everything from the first space onwards and convert to numbers;
# as.numeric() also strips the names that sapply() attached.
depth <- as.numeric(gsub(pattern = " .*", replacement = "", x = str))
# Number of descriptions whose depth could not be converted.
sum(is.na(depth))
Let's get the date
Now the longitude and latitude
Now we assemble the data frame
# Read an earthquake KML/KMZ document and return the assembled object `eq`.
#
# doc: either an already parsed XML document or a character string. A string
#   matching "^http.*kmz$" is downloaded and parsed by getKMZURL(); any other
#   string is handed to xmlParse().
# Value: `eq`, which is created by the <<assemble>> chunk (not shown here;
#   presumably the data frame of earthquakes -- confirm).
#
# The <<...>> lines are chunk references; presumably the literate-programming
# tool replaces them with the corresponding code before this function is
# evaluated, so they must stay exactly as written.
readKMZEarthquakes <-
function(doc)
{
  if(is.character(doc)) {
    if(length(grep("^http.*kmz$", doc)) > 0) {
        # getKMZURL() returns an already parsed document by default
        # (readXML = TRUE), so its result is used directly rather than
        # being passed through xmlParse(asText = TRUE) a second time.
      doc <- getKMZURL(doc)
    } else
      doc <- xmlParse(doc)
  }
  cat("parsed the XML file\n")
  <<magnitude>>
  <<dates>>
  <<description>>
  <<coordinates>>
  <<assemble>>
  eq
}
# Download a KMZ file and return its contents.
#
# doc: URL of the KMZ file to fetch. Defaults to the 2009 USGS earthquakes
#   file, so a call without arguments still retrieves that file.
# readXML: if TRUE (the default), the first entry of the archive is parsed
#   with xmlParse(asText = TRUE) and the document is returned invisibly;
#   if FALSE, the archive object created by zipArchive() is returned.
#
# NOTE(review): getURLContent() and zipArchive() are not provided by the XML
# package loaded at the top of this file; presumably RCurl and Rcompression
# are attached elsewhere -- confirm.
getKMZURL <-
function(doc = "http://neic.usgs.gov/neis/epic/kml/2009_Earthquakes_ALL.kmz",
         readXML = TRUE)
{
    # Fetch the URL the caller asked for rather than a fixed one.
  kmz <- getURLContent(doc)
  ar <- zipArchive(kmz)
  if(readXML)
    invisible(xmlParse(ar[[1]], asText = TRUE))
  else
    ar
}