http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_count_sensor.xq ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_count_sensor.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_count_sensor.xq new file mode 100644 index 0000000..15b5160 --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_count_sensor.xq @@ -0,0 +1,29 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: +XQuery Join Query +------------------- +Count all the weather sensor readings on 1976-07-04. +:) +count( + let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors" + for $r in collection($sensor_collection)/dataCollection/data + let $date := xs:date(fn:substring(xs:string(fn:data($r/date)), 0, 11)) + where $date eq xs:date("1976-07-04") + return $r +) \ No newline at end of file
http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_count_station.xq ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_count_station.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_count_station.xq new file mode 100644 index 0000000..d21fe37 --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q04_count_station.xq @@ -0,0 +1,28 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: +XQuery Join Query +------------------- +Count all the weather stations for Washington state. 
+:) +count( + let $station_collection := "/tmp/1.0_partition_ghcnd_all_xml/stations" + for $s in collection($station_collection)/stationCollection/station + where (some $x in $s/locationLabels satisfies ($x/type eq "ST" and fn:upper-case(fn:data($x/displayName)) eq "WASHINGTON")) + return $s +) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05.xq ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05.xq new file mode 100644 index 0000000..c95f3f5 --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05.xq @@ -0,0 +1,33 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: XQuery Join Aggregate Query :) +(: Find the lowest recorded temperature (TMIN) in the United States for :) +(: 2001. 
:) +fn:min( + let $station_collection := "/tmp/1.0_partition_ghcnd_all_xml/stations" + for $s in collection($station_collection)/stationCollection/station + + let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors" + for $r in collection($sensor_collection)/dataCollection/data + + where $s/id eq $r/station + and (some $x in $s/locationLabels satisfies ($x/type eq "CNTRY" and $x/id eq "FIPS:US")) + and $r/dataType eq "TMIN" + and fn:year-from-dateTime(xs:dateTime(fn:data($r/date))) eq 2001 + return $r/value +) div 10 \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_count_join.xq ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_count_join.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_count_join.xq new file mode 100644 index 0000000..76e3458 --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_count_join.xq @@ -0,0 +1,35 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
:) + +(: +XQuery Join Aggregate Query +------------------- +Find the lowest recorded temperature (TMIN) in the United States for 2001. +:) +fn:count( + let $station_collection := "/tmp/1.0_partition_ghcnd_all_xml/stations" + for $s in collection($station_collection)/stationCollection/station + + let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors" + for $r in collection($sensor_collection)/dataCollection/data + + where $s/id eq $r/station + and (some $x in $s/locationLabels satisfies ($x/type eq "CNTRY" and $x/id eq "FIPS:US")) + and $r/dataType eq "TMIN" + and fn:year-from-dateTime(xs:dateTime(fn:data($r/date))) eq 2001 + return $r +) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_count_sensor.xq ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_count_sensor.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_count_sensor.xq new file mode 100644 index 0000000..3b1046b --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_count_sensor.xq @@ -0,0 +1,31 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. 
See the License for the + specific language governing permissions and limitations + under the License. :) + +(: +XQuery Join Aggregate Query +------------------- +Count all sensor readings for TMIN in 2001. +:) +count( + let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors" + for $r in collection($sensor_collection)/dataCollection/data + + let $date := xs:date(fn:substring(xs:string(fn:data($r/date)), 0, 11)) + where $r/dataType eq "TMIN" + and fn:year-from-date($date) eq 2001 + return $r/value +) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_count_station.xq ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_count_station.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_count_station.xq new file mode 100644 index 0000000..7c2a7ef --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q05_count_station.xq @@ -0,0 +1,28 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: +XQuery Join Aggregate Query +------------------- +Count all stations in the United States. 
+:) +count( + let $station_collection := "/tmp/1.0_partition_ghcnd_all_xml/stations" + for $s in collection($station_collection)/stationCollection/station + where (some $x in $s/locationLabels satisfies ($x/type eq "CNTRY" and $x/id eq "FIPS:US")) + return $s +) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06.xq ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06.xq new file mode 100644 index 0000000..5c8ed54 --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06.xq @@ -0,0 +1,30 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: XQuery Join Query :) +(: Find the highest recorded temperature (TMAX) for each station for each :) +(: day over the year 2000. 
:) +let $station_collection := "/tmp/1.0_partition_ghcnd_all_xml/stations" +for $s in collection($station_collection)/stationCollection/station + +let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors" +for $r in collection($sensor_collection)/dataCollection/data + +where $s/id eq $r/station + and $r/dataType eq "TMAX" + and fn:year-from-dateTime(xs:dateTime(fn:data($r/date))) eq 2000 +return ($s/displayName, $r/date, $r/value) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_count_join.xq ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_count_join.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_count_join.xq new file mode 100644 index 0000000..bad6406 --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_count_join.xq @@ -0,0 +1,34 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: +XQuery Join Query +------------------- +Find the highest recorded temperature (TMAX) for each station for each day over the year 2000. 
+:) +fn:count( + let $station_collection := "/tmp/1.0_partition_ghcnd_all_xml/stations" + for $s in collection($station_collection)/stationCollection/station + + let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors" + for $r in collection($sensor_collection)/dataCollection/data + + where $s/id eq $r/station + and $r/dataType eq "TMAX" + and fn:year-from-dateTime(xs:dateTime(fn:data($r/date))) eq 2000 + return $r +) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_count_sensor.xq ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_count_sensor.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_count_sensor.xq new file mode 100644 index 0000000..54d81c6 --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_count_sensor.xq @@ -0,0 +1,29 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: +XQuery Join Query +------------------- +Count max temperature (TMAX) readings for 2000-01-01. 
+:) +count( + let $sensor_collection := "/tmp/1.0_partition_ghcnd_all_xml/sensors" + for $r in collection($sensor_collection)/dataCollection/data + where $r/dataType eq "TMAX" + and fn:year-from-dateTime(xs:dateTime(fn:data($r/date))) eq 2000 + return $r +) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_count_station.xq ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_count_station.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_count_station.xq new file mode 100644 index 0000000..c94dc78 --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q06_count_station.xq @@ -0,0 +1,27 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: +XQuery Join Query +------------------- +Count all the stations. 
+:) +count( + let $station_collection := "/tmp/1.0_partition_ghcnd_all_xml/stations" + for $s in collection($station_collection)/stationCollection/station + return $s +) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq new file mode 100644 index 0000000..5b1f2ac --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07.xq @@ -0,0 +1,33 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: XQuery Self Join Query :) +(: Self join with all stations finding the difference in min and max :) +(: temperature and get the average. 
:) +fn:avg( + let $sensor_collection_min := "/tmp/1.0_partition_ghcnd_all_xml/sensors" + for $r_min in collection($sensor_collection_min)/dataCollection/data + + let $sensor_collection_max := "/tmp/1.0_partition_ghcnd_all_xml/sensors" + for $r_max in collection($sensor_collection_max)/dataCollection/data + + where $r_min/station eq $r_max/station + and $r_min/date eq $r_max/date + and $r_min/dataType eq "TMIN" + and $r_max/dataType eq "TMAX" + return $r_max/value - $r_min/value +) div 10 \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07_count_join.xq ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07_count_join.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07_count_join.xq new file mode 100644 index 0000000..0ddada0 --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07_count_join.xq @@ -0,0 +1,35 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: +XQuery Join Query +------------------- +Find the all the records for TMIN. 
+:) +fn:count( + let $sensor_collection_min := "/tmp/1.0_partition_ghcnd_all_xml/sensors" + for $r_min in collection($sensor_collection_min)/dataCollection/data + + let $sensor_collection_max := "/tmp/1.0_partition_ghcnd_all_xml/sensors" + for $r_max in collection($sensor_collection_max)/dataCollection/data + + where $r_min/station eq $r_max/station + and $r_min/date eq $r_max/date + and $r_min/dataType eq "TMIN" + and $r_max/dataType eq "TMAX" + return $r_max +) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07_count_tmax.xq ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07_count_tmax.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07_count_tmax.xq new file mode 100644 index 0000000..0b5511f --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07_count_tmax.xq @@ -0,0 +1,28 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: +XQuery Join Query +------------------- +Find the all the records for TMAX. 
+:) +count( + let $sensor_collection_max := "/tmp/1.0_partition_ghcnd_all_xml/sensors" + for $r_max in collection($sensor_collection_max)/dataCollection/data + where $r_max/dataType eq "TMAX" + return $r_max +) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07_count_tmin.xq ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07_count_tmin.xq b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07_count_tmin.xq new file mode 100644 index 0000000..fda029a --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/queries/q07_count_tmin.xq @@ -0,0 +1,28 @@ +(: Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. :) + +(: +XQuery Join Query +------------------- +Find the all the records for TMIN. 
+:) +count( + let $sensor_collection_min := "/tmp/1.0_partition_ghcnd_all_xml/sensors" + for $r_min in collection($sensor_collection_min)/dataCollection/data + where $r_min/dataType eq "TMIN" + return $r_min +) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/README.md ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/README.md b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/README.md new file mode 100644 index 0000000..58bea51 --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/README.md @@ -0,0 +1,51 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +Weather Data Conversion To XML +===================== + +# Introduction + +The NOAA has hosted DAILY GLOBAL HISTORICAL CLIMATOLOGY NETWORK (GHCN-DAILY) +.dat files. Weather.gov has an RSS/XML feed that gives current weather sensor +readings. Using the RSS feed as a template, the GHCN-DAILY historical +information is used to generate past RSS feed XML documents. 
The process allows +testing on a large set of information with out having to continually monitor +the weather.gov site for all the weather details for years. + +# Detailed Description + +Detailed GHDN-DAILY information: +<http://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt> + +The process takes a save folder for the data. The folder contains a several +folders: + + - all_xml_files (The generated xml files for a given package) + - downloads (All files taken from the NOAA HTTP site) + - dataset-[name] (all files related to a single dataset) + + +# Examples commands + +Building + + +Partitioning +python weather_cli.py -x weather_example.xml + +Linking \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/benchmark_logging.properties ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/benchmark_logging.properties b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/benchmark_logging.properties new file mode 100644 index 0000000..2fb0af0 --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/benchmark_logging.properties @@ -0,0 +1 @@ +java.util.logging.ConsoleHandler.level=OFF \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh new file mode 100755 index 0000000..88339bd --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license 
agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Run all the queries and save a log. +# First argument: Supply the folder which houses all the queries (recursive). +# Second argument: adds options to the VXQuery CLI. +# +# run_benchmark.sh ./noaa-ghcn-daily/benchmarks/local_speed_up/queries/ +# run_benchmark.sh ./noaa-ghcn-daily/benchmarks/local_speed_up/queries/ "-client-net-ip-address 169.235.27.138" +# run_benchmark.sh ./noaa-ghcn-daily/benchmarks/local_speed_up/queries/ "" q03 +# +REPEAT=5 +IGNORE=2 +FRAME_SIZE=$((8*1024)) +BUFFER_SIZE=$((32*1024*1024)) +JOIN_HASH_SIZE=-1 + +if [ -z "${1}" ] +then + echo "Please supply a directory for query files to be found." 
+ exit +fi + +export JAVA_OPTS="$JAVA_OPTS -server -Xmx8G -XX:+HeapDumpOnOutOfMemoryError -Djava.util.logging.config.file=./vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/benchmark_logging.properties" + +for j in $(find ${1} -name '*q??.xq') +do + if [ -z "${3}" ] || [[ "${j}" =~ "${3}" ]] + then + date + echo "Running query: ${j}" + log_file="$(basename ${j}).$(date +%Y%m%d%H%M).log" + log_base_path=$(dirname ${j/queries/query_logs}) + mkdir -p ${log_base_path} + time sh ./vxquery-cli/target/appassembler/bin/vxq ${j} ${2} -timing -showquery -showoet -showrp -frame-size ${FRAME_SIZE} -buffer-size ${BUFFER_SIZE} -join-hash-size ${JOIN_HASH_SIZE} -repeatexec ${REPEAT} > ${log_base_path}/${log_file} 2>&1 + echo "\nBuffer Size: ${BUFFER_SIZE}" >> ${log_base_path}/${log_file} + echo "\nFrame Size: ${FRAME_SIZE}" >> ${log_base_path}/${log_file} + echo "\nJoin Hash Size: ${JOIN_HASH_SIZE}" >> ${log_base_path}/${log_file} + fi; +done + +if which programname >/dev/null; +then + echo "Sending out e-mail notification." + SUBJECT="Benchmark Tests Finished" + EMAIL="[email protected]" + /bin/mail -s "${SUBJECT}" "${EMAIL}" <<EOM + Completed all tests in folder ${1}. + EOM +else + echo "No mail command to use." +fi; http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark_cluster.sh ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark_cluster.sh b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark_cluster.sh new file mode 100755 index 0000000..98ab04b --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark_cluster.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Run all the queries and save a log. +# First argument: Supply the folder which houses all the queries (recursive). +# Second argument: adds options to the VXQuery CLI. +# +# run_benchmark.sh ./noaa-ghcn-daily/benchmarks/local_speed_up/queries/ +# run_benchmark.sh ./noaa-ghcn-daily/benchmarks/local_speed_up/queries/ "-client-net-ip-address 169.235.27.138" +# run_benchmark.sh ./noaa-ghcn-daily/benchmarks/local_speed_up/queries/ "" q03 +# +CLUSTER="uci" +REPEAT=5 +FRAME_SIZE=$((8*1024)) +BUFFER_SIZE=$((32*1024*1024)) +#JOIN_HASH_SIZE=$((256*1024*1024)) +JOIN_HASH_SIZE=-1 + +if [ -z "${1}" ] +then + echo "Please supply a directory for query files to be found." + exit +fi + +if [ -z "${2}" ] +then + echo "Please the number of nodes (start at 0)." + exit +fi + +# Run queries for the specified number of nodes. 
+echo "Starting ${2} cluster nodes" +python vxquery-server/src/main/resources/scripts/cluster_cli.py -c vxquery-server/src/main/resources/conf/${CLUSTER}/${2}nodes.xml -a start + +# wait for cluster to finish setting up +sleep 5 + +export JAVA_OPTS="$JAVA_OPTS -server -Xmx8G -XX:+HeapDumpOnOutOfMemoryError -Djava.util.logging.config.file=./vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/benchmark_logging.properties" + +for j in $(find ${1} -name '*q??.xq') +do + # Only work with i nodes. + if [[ "${j}" =~ "${2}nodes" ]] + then + # Only run for specified queries. + if [ -z "${4}" ] || [[ "${j}" =~ "${4}" ]] + then + date + echo "Running query: ${j}" + log_file="$(basename ${j}).$(date +%Y%m%d%H%M).log" + log_base_path=$(dirname ${j/queries/query_logs}) + mkdir -p ${log_base_path} + time sh ./vxquery-cli/target/appassembler/bin/vxq ${j} ${3} -timing -showquery -showoet -showrp -frame-size ${FRAME_SIZE} -buffer-size ${BUFFER_SIZE} -join-hash-size ${JOIN_HASH_SIZE} -repeatexec ${REPEAT} > ${log_base_path}/${log_file} 2>&1 + echo "\nBuffer Size: ${BUFFER_SIZE}" >> ${log_base_path}/${log_file} + echo "\nFrame Size: ${FRAME_SIZE}" >> ${log_base_path}/${log_file} + echo "\nJoin Hash Size: ${JOIN_HASH_SIZE}" >> ${log_base_path}/${log_file} + fi; + fi; +done + +# Stop cluster. +python vxquery-server/src/main/resources/scripts/cluster_cli.py -c vxquery-server/src/main/resources/conf/${CLUSTER}/${2}nodes.xml -a stop + +if which programname >/dev/null; +then + echo "Sending out e-mail notification." + SUBJECT="Benchmark Cluster Tests Finished" + EMAIL="[email protected]" + /bin/mail -s "${SUBJECT}" "${EMAIL}" <<EOM + Completed all tests in folder ${1} for a ${2} node cluster using ${HOSTNAME}. + EOM +else + echo "No mail command to use." 
+fi; http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_group_test.sh ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_group_test.sh b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_group_test.sh new file mode 100755 index 0000000..58976b7 --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_group_test.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +DATASET="dataset-hcn-d2" +cluster_ip=${1} +base_weather_folder=${2} + +for n in 7 6 5 3 4 2 1 0 +do + #for t in "batch_scale_out" "speed_up" + for t in "batch_scale_out" + #for t in "speed_up" + do + for p in 2 + do + for c in 4 + do + echo " ==== node ${n} test ${t} partition ${p} cores ${c} ====" + sh vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark_cluster.sh ${base_weather_folder}/${DATASET}/queries/${t}/${n}nodes/d2_p${p}/ ${n} "-client-net-ip-address ${cluster_ip} -available-processors ${c}" + done + done + done +done + +if which programname >/dev/null; +then + echo "Sending out e-mail notification." + SUBJECT="Benchmark Group Tests Finished" + EMAIL="[email protected]" + /bin/mail -s "${SUBJECT}" "${EMAIL}" <<EOM + Completed all tests in the predefined group for ${DATASET}. + EOM +else + echo "No mail command to use." +fi; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_mrql_tests.sh ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_mrql_tests.sh b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_mrql_tests.sh new file mode 100755 index 0000000..a6788be --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_mrql_tests.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +export JAVA_HOME=/home/ecarm002/java/jdk1.6.0_45 +REPEAT=${1} +DATASET="hcn" + +for n in `seq 0 7` +#for n in 0 +do + date + echo "Running q0${n} on ${DATASET} for MRQL." + time for i in {1..${REPEAT}}; do ~/mrql/incubator-mrql/bin/mrql -dist -nodes 5 ~/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/other_systems/mrql_${DATASET}/q0${n}.mrql >> weather_data/mrql/query_logs/${DATASET}/q0${n}.mrql.log 2>&1; done; +done + +if which programname >/dev/null; +then + echo "Sending out e-mail notification." + SUBJECT="MRQL Tests Finished (${DATASET})" + EMAIL="[email protected]" + /bin/mail -s "${SUBJECT}" "${EMAIL}" <<EOM + Completed all MRQL tests on ${DATASET}. + EOM +else + echo "No mail command to use." +fi; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_benchmark.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_benchmark.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_benchmark.py new file mode 100644 index 0000000..4f81f86 --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_benchmark.py @@ -0,0 +1,377 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os.path +import linecache +import distutils.core +import fileinput +import socket + +from weather_config import * +from weather_data_files import * + +# Weather data files created to manage the conversion process. +# Allows partition and picking up where you left off. +# +# benchmark_name/ +# data/ +# queries/ +# logs/ +class WeatherBenchmark: + + DATA_LINKS_FOLDER = "data_links/" + LARGE_FILE_ROOT_TAG = WeatherDataFiles.LARGE_FILE_ROOT_TAG + QUERY_REPLACEMENT_KEY = "/tmp/1.0_partition_ghcnd_all_xml/" + QUERY_MASTER_FOLDER = "../queries/" + QUERY_FILE_LIST = [ + "q00.xq", + "q01.xq", + "q02.xq", + "q03.xq", + "q04.xq", + "q05.xq", + "q06.xq", + "q07.xq" + ] + QUERY_UTILITY_LIST = [ + "no_result.xq", + "sensor_count.xq", + "station_count.xq", + "q04_sensor.xq", + "q04_station.xq", + "q05_sensor.xq", + "q05_station.xq", + "q06_sensor.xq", + "q06_station.xq", + "q07_tmin.xq", + "q07_tmax.xq", + ] + BENCHMARK_LOCAL_TESTS = ["local_speed_up", "local_batch_scale_out"] + BENCHMARK_CLUSTER_TESTS = ["speed_up", "batch_scale_out"] + QUERY_COLLECTIONS = ["sensors", "stations"] + + SEPERATOR = "|" + + def __init__(self, base_paths, partitions, dataset, nodes): + self.base_paths = base_paths + self.partitions = partitions + self.dataset = dataset + self.nodes = nodes + + def print_partition_scheme(self): + if (len(self.base_paths) == 0): + return + for test in self.dataset.get_tests(): + if test in 
self.BENCHMARK_LOCAL_TESTS: + self.print_local_partition_schemes(test) + elif test in self.BENCHMARK_CLUSTER_TESTS: + self.print_cluster_partition_schemes(test) + else: + print "Unknown test." + exit() + + def print_local_partition_schemes(self, test): + node_index = 0 + virtual_disk_partitions = get_local_virtual_disk_partitions(self.partitions) + for p in self.partitions: + scheme = self.get_local_partition_scheme(test, p) + self.print_partition_schemes(virtual_disk_partitions, scheme, test, p, node_index) + + def print_cluster_partition_schemes(self, test): + node_index = self.get_current_node_index() + virtual_disk_partitions = get_cluster_virtual_disk_partitions(self.nodes, self.partitions) + for p in self.partitions: + scheme = self.get_cluster_partition_scheme(test, p) + self.print_partition_schemes(virtual_disk_partitions, scheme, test, p, node_index) + + def print_partition_schemes(self, virtual_partitions, scheme, test, partitions, node_id): + print + print "---------------- Partition Scheme --------------------" + print " Test: " + test + print " Virtual Partitions: " + str(virtual_partitions) + print " Disks: " + str(len(self.base_paths)) + print " Partitions: " + str(partitions) + print " Node Id: " + str(node_id) + + if isinstance(scheme, (tuple, list, dict, set)) and len(scheme) > 0: + folder_length = len(scheme[0][3]) + 5 + row_format = "{:>5} {:>5} {:>5} {:<" + str(folder_length) + "} {:<" + str(folder_length) + "}" + HEADER = ("Disk", "Index", "Link", "Data Path", "Link Path") + print row_format.format(*HEADER) + for row in scheme: + print row_format.format(*row) + print + else: + print " Scheme is EMPTY." 
+ + def get_local_partition_scheme(self, test, partition): + scheme = [] + virtual_disk_partitions = get_local_virtual_disk_partitions(self.partitions) + data_schemes = get_disk_partition_scheme(0, virtual_disk_partitions, self.base_paths) + link_base_schemes = get_disk_partition_scheme(0, partition, self.base_paths, self.DATA_LINKS_FOLDER + test) + + # Match link paths to real data paths. + group_size = len(data_schemes) / len(link_base_schemes) + for d in range(len(self.base_paths)): + offset = 0 + for link_node, link_disk, link_virtual, link_index, link_path in link_base_schemes: + if d == link_disk: + # Only consider a single disk at a time. + for data_node, data_disk, data_virtual, data_index, data_path in data_schemes: + if test == "local_speed_up" and data_disk == link_disk \ + and offset <= data_index and data_index < offset + group_size: + scheme.append([data_disk, data_index, link_index, data_path, link_path]) + elif test == "local_batch_scale_out" and data_disk == link_disk \ + and data_index == link_index: + scheme.append([data_disk, data_index, link_index, data_path, link_path]) + offset += group_size + return scheme + + def get_cluster_partition_scheme(self, test, partition): + node_index = self.get_current_node_index() + if node_index == -1: + print "Unknown host." + return + + scheme = [] + virtual_disk_partitions = get_cluster_virtual_disk_partitions(self.nodes, self.partitions) + data_schemes = get_disk_partition_scheme(node_index, virtual_disk_partitions, self.base_paths) + link_base_schemes = get_cluster_link_scheme(len(self.nodes), partition, self.base_paths, self.DATA_LINKS_FOLDER + test) + + # Match link paths to real data paths. 
+ for link_node, link_disk, link_virtual, link_index, link_path in link_base_schemes: + # Prep + if test == "speed_up": + group_size = virtual_disk_partitions / (link_node + 1) / partition + elif test == "batch_scale_out": + group_size = virtual_disk_partitions / len(self.nodes) / partition + else: + print "Unknown test." + return + + node_offset = group_size * node_index * partition + node_offset += group_size * link_index + has_data = True + if link_node < node_index: + has_data = False + + # Make links + for date_node, data_disk, data_virtual, data_index, data_path in data_schemes: + if has_data and data_disk == link_disk \ + and node_offset <= data_index and data_index < node_offset + group_size: + scheme.append([link_disk, data_index, link_index, data_path, link_path]) + scheme.append([link_disk, -1, link_index, "", link_path]) + return scheme + + def build_data_links(self, reset): + if (len(self.base_paths) == 0): + return + if reset: + shutil.rmtree(self.base_paths[0] + self.DATA_LINKS_FOLDER) + for test in self.dataset.get_tests(): + if test in self.BENCHMARK_LOCAL_TESTS: + for i in self.partitions: + scheme = self.get_local_partition_scheme(test, i) + self.build_data_links_scheme(scheme) + if 1 in self.partitions and len(self.base_paths) > 1: + scheme = self.build_data_links_local_zero_partition(test) + self.build_data_links_scheme(scheme) + elif test in self.BENCHMARK_CLUSTER_TESTS: + for i in self.partitions: + scheme = self.get_cluster_partition_scheme(test, i) + self.build_data_links_scheme(scheme) + if 1 in self.partitions and len(self.base_paths) > 1: + scheme = self.build_data_links_cluster_zero_partition(test) + self.build_data_links_scheme(scheme) + else: + print "Unknown test." 
+ exit() + + def build_data_links_scheme(self, scheme): + '''Build all the data links based on the scheme information.''' + for (data_disk, data_index, partition, data_path, link_path) in scheme: + self.add_collection_links_for(data_path, link_path, data_index) + + def build_data_links_cluster_zero_partition(self, test): + '''Build a scheme for all data in one symbolically linked folder. (0 partition)''' + scheme = [] + link_base_schemes = get_cluster_link_scheme(len(self.nodes), 1, self.base_paths, self.DATA_LINKS_FOLDER + test) + for link_node, link_disk, link_virtual, link_index, link_path in link_base_schemes: + new_link_path = self.get_zero_partition_path(link_node, self.DATA_LINKS_FOLDER + test + "/" + str(link_node) + "nodes") + scheme.append([0, link_disk, 0, link_path, new_link_path]) + return scheme + + def build_data_links_local_zero_partition(self, test): + '''Build a scheme for all data in one symbolically linked folder. (0 partition)''' + scheme = [] + index = 0 + link_base_schemes = get_disk_partition_scheme(0, 1, self.base_paths, self.DATA_LINKS_FOLDER + test) + for link_node, link_disk, link_virtual, link_index, link_path in link_base_schemes: + if test == "local_batch_scale_out" and index > 0: + continue + new_link_path = self.get_zero_partition_path(link_node, self.DATA_LINKS_FOLDER + test) + scheme.append([0, index, 0, link_path, new_link_path]) + index += 1 + return scheme + + def get_zero_partition_path(self, node, key): + '''Return a partition path for the zero partition.''' + base_path = self.base_paths[0] + new_link_path = get_disk_partition_scheme(node, 1, [base_path], key)[0][PARTITION_INDEX_PATH] + return new_link_path.replace("p1", "p0") + + def get_current_node_index(self): + found = False + node_index = 0 + for machine in self.nodes: + if socket.gethostname().startswith(machine.get_node_name()): + found = True + break + node_index += 1 + + if found: + return node_index + else: + return -1 + + def add_collection_links_for(self, 
real_path, link_path, index): + for collection in self.QUERY_COLLECTIONS: + collection_path = link_path + collection + "/" + collection_index = collection_path + "index" + str(index) + if not os.path.isdir(collection_path): + os.makedirs(collection_path) + if index >= 0: + if os.path.islink(collection_index): + os.unlink(collection_index) + os.symlink(real_path + collection + "/", collection_index) + + def copy_query_files(self, reset): + for test in self.dataset.get_tests(): + if test in self.BENCHMARK_LOCAL_TESTS: + self.copy_local_query_files(test, reset) + elif test in self.BENCHMARK_CLUSTER_TESTS: + self.copy_cluster_query_files(test, reset) + else: + print "Unknown test." + exit() + + def copy_cluster_query_files(self, test, reset): + '''Determine the data_link path for cluster query files and copy with + new location for collection.''' + if 1 in self.partitions and len(self.base_paths) > 1: + for n in range(len(self.nodes)): + query_path = get_cluster_query_path(self.base_paths, test, 0, n) + prepare_path(query_path, reset) + + # Copy query files. + new_link_path = self.get_zero_partition_path(n, self.DATA_LINKS_FOLDER + test + "/" + str(n) + "nodes") + self.copy_and_replace_query(query_path, [new_link_path]) + for n in range(len(self.nodes)): + for p in self.partitions: + query_path = get_cluster_query_path(self.base_paths, test, p, n) + prepare_path(query_path, reset) + + # Copy query files. + partition_paths = get_disk_partition_paths(n, p, self.base_paths, self.DATA_LINKS_FOLDER + test + "/" + str(n) + "nodes") + self.copy_and_replace_query(query_path, partition_paths) + + def copy_local_query_files(self, test, reset): + '''Determine the data_link path for local query files and copy with + new location for collection.''' + if 1 in self.partitions and len(self.base_paths) > 1: + query_path = get_local_query_path(self.base_paths, test, 0) + prepare_path(query_path, reset) + + # Copy query files. 
+ new_link_path = self.get_zero_partition_path(0, self.DATA_LINKS_FOLDER + test) + self.copy_and_replace_query(query_path, [new_link_path]) + for p in self.partitions: + query_path = get_local_query_path(self.base_paths, test, p) + prepare_path(query_path, reset) + + # Copy query files. + partition_paths = get_disk_partition_paths(0, p, self.base_paths, self.DATA_LINKS_FOLDER + test) + self.copy_and_replace_query(query_path, partition_paths) + + def copy_and_replace_query(self, query_path, replacement_list): + '''Copy the query files over to the query_path and replace the path + for the where the collection data is located.''' + for query_file in self.QUERY_FILE_LIST + self.QUERY_UTILITY_LIST: + shutil.copyfile(self.QUERY_MASTER_FOLDER + query_file, query_path + query_file) + + # Make a search replace for each collection. + for collection in self.QUERY_COLLECTIONS: + replacement_list_with_type = [] + for replace in replacement_list: + replacement_list_with_type.append(replace + collection) + + replace_string = self.SEPERATOR.join(replacement_list_with_type) + for line in fileinput.input(query_path + query_file, True): + sys.stdout.write(line.replace(self.QUERY_REPLACEMENT_KEY + collection, replace_string)) + + # Make a search replace for partition type. + if self.dataset.get_partition_type() == "large_files": + for line in fileinput.input(query_path + query_file, True): + sys.stdout.write(line.replace("/stationCollection", "/" + self.LARGE_FILE_ROOT_TAG + "/stationCollection")) + for line in fileinput.input(query_path + query_file, True): + sys.stdout.write(line.replace("/dataCollection", "/" + self.LARGE_FILE_ROOT_TAG + "/dataCollection")) + + def get_number_of_slices_per_disk(self): + if len(self.dataset.get_tests()) == 0: + print "No test has been defined in config file." 
+ else: + for test in self.dataset.get_tests(): + if test in self.BENCHMARK_LOCAL_TESTS: + return get_local_virtual_disk_partitions(self.partitions) + elif test in self.BENCHMARK_CLUSTER_TESTS: + return get_cluster_virtual_disk_partitions(self.nodes, self.partitions) + else: + print "Unknown test." + exit() + +def get_cluster_link_scheme(nodes, partition, base_paths, key="partitions"): + link_paths = [] + for n in range(0, nodes): + new_link_path = get_disk_partition_scheme(n, partition, base_paths, key + "/" + str(n) + "nodes") + link_paths.extend(new_link_path) + return link_paths + +def get_local_query_path(base_paths, test, partition): + return base_paths[0] + "queries/" + test + "/" + get_local_query_folder(len(base_paths), partition) + "/" + +def get_local_query_folder(disks, partitions): + return "d" + str(disks) + "_p" + str(partitions) + +def get_cluster_query_path(base_paths, test, partition, nodes): + return base_paths[0] + "queries/" + test + "/" + str(nodes) + "nodes/" + get_local_query_folder(len(base_paths), partition) + "/" + +def get_cluster_virtual_disk_partitions(nodes, partitions): + vp = get_local_virtual_disk_partitions(partitions) + vn = calculate_partitions(range(1, len(nodes)+1, 1)) + return vp * vn + +def get_local_virtual_disk_partitions(partitions): + return calculate_partitions(partitions) + +def calculate_partitions(list): + x = 1 + for i in list: + if x % i != 0: + if i % x == 0: + x = i + else: + x *= i + return x http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py new file mode 100644 index 0000000..eeae25c --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py @@ -0,0 +1,236 
@@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys, getopt + +# Custom modules. +from weather_data_files import * +from weather_download_files import * +from weather_convert_to_xml import * +from weather_config import * +from weather_benchmark import * + +DEBUG_OUTPUT = False + +# +# Weather conversion for GHCN-DAILY files to xml. +# +# http://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt +# +def main(argv): + append = False + max_records = 0 + process_file_name = "" + reset = False + section = "all" + token = "" + update = False + xml_config_path = "" + + try: + opts, args = getopt.getopt(argv, "af:hl:m:ruvw:x:", ["file=", "locality=", "max_station_files=", "web_service=", "xml_config="]) + except getopt.GetoptError: + print 'The file options for weather_cli.py were not correctly specified.' + print 'To see a full list of options try:' + print ' $ python weather_cli.py -h' + sys.exit(2) + for opt, arg in opts: + if opt == '-h': + print 'Converting weather daily files to xml options:' + print ' -a Append the results to the progress file.' + print ' -f (str) The file name of a specific station to process.' + print ' * Helpful when testing a single stations XML file output.' 
+ print ' -l (str) Select the locality of the scripts execution (download, progress_file, sensor_build, station_build, partition, partition_scheme, test_links, queries, inventory, statistics).' + print ' -m (int) Limits the number of files created for each station.' + print ' * Helpful when testing to make sure all elements are supported for each station.' + print ' Alternate form: --max_station_files=(int)' + print ' -r Reset the build process. (For one section or all sections depending on other parameters.)' + print ' -u Recalculate the file count and data size for each data source file.' + print ' -v Extra debug information.' + print ' -w (str) Downloads the station XML file form the web service.' + print ' -x (str) XML config file for weather data.' + sys.exit() + elif opt in ('-a', "--append"): + append = True + elif opt in ('-f', "--file"): + # check if file exists. + if os.path.exists(arg): + process_file_name = arg + else: + print 'Error: Argument must be a file name for --file (-f).' + sys.exit() + elif opt in ('-l', "--locality"): + if arg in ("download", "progress_file", "sensor_build", "station_build", "partition", "partition_scheme", "test_links", "queries", "inventory", "statistics"): + section = arg + else: + print 'Error: Argument must be a string for --locality (-l) and a valid locality.' + sys.exit() + elif opt in ('-m', "--max_station_files"): + if arg.isdigit(): + max_records = int(arg) + else: + print 'Error: Argument must be an integer for --max_station_files (-m).' + sys.exit() + elif opt == '-r': + reset = True + elif opt == '-u': + update = True + elif opt == '-v': + global DEBUG_OUTPUT + DEBUG_OUTPUT = True + elif opt == '-w': + # check if file exists. + if arg is not "": + token = arg + else: + print 'Error: Argument must be a string --web_service (-w).' + sys.exit() + elif opt in ('-x', "--xml_config"): + # check if file exists. 
+ if os.path.exists(arg): + xml_config_path = arg + else: + print 'Error: Argument must be a xml file for --xml_config (-x).' + sys.exit() + + # Required fields to run the script. + if xml_config_path == "" or not os.path.exists(xml_config_path): + print 'Error: The xml config option must be supplied: --xml_config (-x).' + sys.exit() + config = WeatherConfig(xml_config_path) + + # Required fields to run the script. + if config.get_save_path() == "" or not os.path.exists(config.get_save_path()): + print 'Error: The save directory option must be supplied in the config file.' + sys.exit() + + # Set up downloads folder. + download_path = config.get_save_path() + "/downloads" + if section in ("all", "download"): + print 'Processing the download section.' + download = WeatherDownloadFiles(download_path) + download.download_ghcnd_files(reset) + download.download_mshr_files(reset) + + # Unzip the required file. + download.unzip_ghcnd_package(config.get_package(), reset) + download.unzip_mshr_files(reset) + + + # Create some basic paths for save files and references. + ghcnd_data_dly_path = download_path + '/' + config.get_package() + '/' + config.get_package() + xml_data_save_path = config.get_save_path() + '/all_xml_files/' + + # Make sure the xml folder is available. + if not os.path.isdir(xml_data_save_path): + os.makedirs(xml_data_save_path) + + # Set up the XML build objects. + convert = WeatherWebServiceMonthlyXMLFile(download_path, xml_data_save_path, DEBUG_OUTPUT) + progress_file = xml_data_save_path + "_data_progress.csv" + data = WeatherDataFiles(ghcnd_data_dly_path, progress_file) + if section in ("all", "progress_file"): + print 'Processing the progress_file section.' + options = list() + if append: + options.append('append') + if update: + options.append('recalculate') + if reset: + options.append('reset') + data.build_progress_file(options, convert) + + if section in ("all", "sensor_build"): + print 'Processing the sensor_build section.' 
+ if process_file_name is not "": + # process a single file + if os.path.exists(process_file_name): + (file_count, data_size) = convert.process_sensor_file(process_file_name, max_records, 4) + data.update_file_sensor_status(process_file_name, WeatherDataFiles.DATA_FILE_GENERATED, file_count, data_size) + else: + data.update_file_sensor_status(process_file_name, WeatherDataFiles.DATA_FILE_MISSING) + else: + # process directory + data.reset() + data.set_type("sensor") + data.set_data_reset(reset) + for file_name in data: + file_path = ghcnd_data_dly_path + '/' + file_name + if os.path.exists(file_path): + (file_count, data_size) = convert.process_sensor_file(file_path, max_records, 4) + data.update_file_sensor_status(file_name, WeatherDataFiles.DATA_FILE_GENERATED, file_count, data_size) + else: + data.update_file_sensor_status(file_name, WeatherDataFiles.DATA_FILE_MISSING) + + if section in ("all", "station_build"): + print 'Processing the station_build section.' + data.reset() + data.set_type("station") + data.set_data_reset(reset) + if token is not "": + convert.set_token(token) + for file_name in data: + file_path = ghcnd_data_dly_path + '/' + file_name + if os.path.exists(file_path): + return_status = convert.process_station_file(file_path) + status = data.get_station_status(return_status) + data.update_file_station_status(file_name, status) + else: + data.update_file_station_status(file_name, WeatherDataFiles.DATA_FILE_MISSING) + + for dataset in config.get_dataset_list(): + # Set up the setting for each dataset. 
+ dataset_folder = "/dataset-" + dataset.get_name() + progress_file = config.get_save_path() + dataset_folder + "/_data_progress.csv" + data = WeatherDataFiles(ghcnd_data_dly_path, progress_file) + + base_paths = [] + for paths in dataset.get_save_paths(): + base_paths.append(paths + dataset_folder + "/") + benchmark = WeatherBenchmark(base_paths, dataset.get_partitions(), dataset, config.get_node_machine_list()) + + if section in ("all", "partition", "partition_scheme"): + slices = benchmark.get_number_of_slices_per_disk() + print 'Processing the partition section (' + dataset.get_name() + ':d' + str(len(base_paths)) + ':s' + str(slices) + ').' + data.reset() + if section == "partition_scheme": + benchmark.print_partition_scheme() + else: + if dataset.get_partition_type() == "large_files": + data.build_to_n_partition_files(xml_data_save_path, slices, base_paths, reset) + else: + data.copy_to_n_partitions(xml_data_save_path, slices, base_paths, reset) + + if section in ("all", "test_links"): + # TODO determine current node + print 'Processing the test links section (' + dataset.get_name() + ').' + benchmark.print_partition_scheme() + benchmark.build_data_links(reset) + + if section in ("all", "queries"): + print 'Processing the queries section (' + dataset.get_name() + ').' + benchmark.copy_query_files(reset) + + if section in ("inventory"): + print 'Processing the inventory section.' + convert.process_inventory_file() + +# if section in ("statistics"): +# print 'Processing the statistics section.' 
+# data.print_progress_file_stats(convert) + +if __name__ == "__main__": + main(sys.argv[1:]) http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config.py new file mode 100644 index 0000000..80607b8 --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from xml.dom.minidom import parse + +class WeatherConfig: + def __init__(self, config_xml_file): + self.config_xml_file = config_xml_file + + self.config = parse(self.config_xml_file) + + def get_save_path(self): + return self.get_text(self.config.getElementsByTagName("save_path")[0]) + + def get_package(self): + return self.get_text(self.config.getElementsByTagName("package")[0]) + + def get_node_machine_list(self): + nodes = [] + for node in self.config.getElementsByTagName("node"): + id = self.get_node_name(node) + ip = self.get_node_ip(node) + nodes.append(Machine(id, ip)) + return nodes + + def get_dataset_list(self): + nodes = [] + for node in self.config.getElementsByTagName("dataset"): + name = self.get_dataset_name(node) + save_paths = self.get_dataset_save_paths(node) + partition_type = self.get_dataset_partition_type(node) + partitions = self.get_dataset_partitions(node) + tests = self.get_dataset_tests(node) + nodes.append(Dataset(name, save_paths, partition_type, partitions, tests)) + return nodes + + + # -------------------------------------------------------------------------- + # Node Specific Functions + # -------------------------------------------------------------------------- + def get_node_ip(self, node): + return self.get_text(node.getElementsByTagName("cluster_ip")[0]) + + def get_node_name(self, node): + return self.get_text(node.getElementsByTagName("id")[0]) + + + # -------------------------------------------------------------------------- + # Dataset Specific Functions + # -------------------------------------------------------------------------- + def get_dataset_name(self, node): + return self.get_text(node.getElementsByTagName("name")[0]) + + def get_dataset_save_paths(self, node): + paths = [] + for item in node.getElementsByTagName("save_path"): + paths.append(self.get_text(item)) + return paths + + def get_dataset_partition_type(self, node): + return self.get_text(node.getElementsByTagName("partition_type")[0]) + + def 
get_dataset_partitions(self, node): + paths = [] + for item in node.getElementsByTagName("partitions_per_path"): + paths.append(int(self.get_text(item))) + return paths + + def get_dataset_tests(self, node): + tests = [] + for item in node.getElementsByTagName("test"): + tests.append(self.get_text(item)) + return tests + + def get_text(self, xml_node): + rc = [] + for node in xml_node.childNodes: + if node.nodeType == node.TEXT_NODE: + rc.append(node.data) + return ''.join(rc) + +class Machine: + def __init__(self, id, ip): + self.id = id + self.ip = ip + + def get_node_name(self): + return self.id + + def get_node_ip(self): + return self.ip + + def __repr__(self): + return self.id + "(" + self.ip + ")" + +class Dataset: + def __init__(self, name, save_paths, partition_type, partitions, tests): + self.name = name + self.save_paths = save_paths + self.partitions = partitions + self.partition_type = partition_type + self.tests = tests + + def get_name(self): + return self.name + + def get_save_paths(self): + return self.save_paths + + def get_partitions(self): + return self.partitions + + def get_partition_type(self): + return self.partition_type + + def get_tests(self): + return self.tests + + def __repr__(self): + return self.name + ":" + str(self.save_paths) + ":" + str(self.partitions) + http://git-wip-us.apache.org/repos/asf/vxquery/blob/c182925c/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config_ghcnd.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config_ghcnd.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config_ghcnd.py new file mode 100644 index 0000000..04fff52 --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config_ghcnd.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license 
# Base URL used to get all the required files.
BASE_DOWNLOAD_URL = 'http://www1.ncdc.noaa.gov/pub/data/ghcn/daily/'

# List of required files for a build.
FILE_NAMES = [
    'ghcnd-countries.txt',
    'ghcnd-inventory.txt',
    'ghcnd-states.txt',
    'ghcnd-stations.txt',
    'ghcnd-version.txt',
    'ghcnd_all.tar.gz',
    'ghcnd_gsn.tar.gz',
    'ghcnd_hcn.tar.gz',
    'readme.txt',
    'status.txt',
]

# Index values of each field details entry: [name, start, end, type].
FIELD_INDEX_NAME = 0
FIELD_INDEX_START = 1
FIELD_INDEX_END = 2
FIELD_INDEX_TYPE = 3

# Positions of the fixed leading fields within DLY_FIELDS.
DLY_FIELD_ID = 0
DLY_FIELD_YEAR = 1
DLY_FIELD_MONTH = 2
DLY_FIELD_ELEMENT = 3

# Day fields start after the four header fields; four fields per day slot.
DLY_FIELD_DAY_OFFSET = 4
DLY_FIELD_DAY_FIELDS = 4

# Fixed-width layout of one .dly row: the station/date header fields...
DLY_FIELDS = [
    ['ID', 1, 11, 'Character'],
    ['YEAR', 12, 15, 'Integer'],
    ['MONTH', 16, 17, 'Integer'],
    ['ELEMENT', 18, 21, 'Character'],
]

# ...followed by 31 day slots, each 8 columns wide: a 5-column value and
# three 1-column flags (measurement, quality, source).
for day in range(1, 32):
    base = 22 + (day - 1) * 8
    DLY_FIELDS.append(['VALUE' + str(day), base, base + 4, 'Integer'])
    DLY_FIELDS.append(['MFLAG' + str(day), base + 5, base + 5, 'Character'])
    DLY_FIELDS.append(['QFLAG' + str(day), base + 6, base + 6, 'Character'])
    DLY_FIELDS.append(['SFLAG' + str(day), base + 7, base + 7, 'Character'])

# Fixed-width layout of a ghcnd-stations.txt row.
STATIONS_FIELDS = {
    'ID': ['ID', 1, 11, 'Character'],
    'LATITUDE': ['LATITUDE', 13, 20, 'Real'],
    'LONGITUDE': ['LONGITUDE', 22, 30, 'Real'],
    'ELEVATION': ['ELEVATION', 32, 37, 'Real'],
    'STATE': ['STATE', 39, 40, 'Character'],
    'NAME': ['NAME', 42, 71, 'Character'],
    'GSNFLAG': ['GSNFLAG', 73, 75, 'Character'],
    'HCNFLAG': ['HCNFLAG', 77, 79, 'Character'],
    'WMOID': ['WMOID', 81, 85, 'Character'],
}

# Fixed-width layout of a ghcnd-countries.txt row.
COUNTRIES_FIELDS = {
    'CODE': ['CODE', 1, 2, 'Character'],
    'NAME': ['NAME', 4, 50, 'Character'],
}

# Fixed-width layout of a ghcnd-states.txt row.
STATES_FIELDS = {
    'CODE': ['CODE', 1, 2, 'Character'],
    'NAME': ['NAME', 4, 50, 'Character'],
}

# Fixed-width layout of a ghcnd-inventory.txt row.
INVENTORY_FIELDS = {
    'ID': ['ID', 1, 11, 'Character'],
    'LATITUDE': ['LATITUDE', 13, 20, 'Real'],
    'LONGITUDE': ['LONGITUDE', 22, 30, 'Real'],
    'ELEMENT': ['ELEMENT', 32, 35, 'Character'],
    'FIRSTYEAR': ['FIRSTYEAR', 37, 40, 'Integer'],
    'LASTYEAR': ['LASTYEAR', 42, 45, 'Integer'],
}
# List of required files for a build.
MSHR_URLS = [
    'ftp://ftp.ncdc.noaa.gov/pub/data/homr/docs/MSHR_Enhanced_Table.txt',
    'http://www.ncdc.noaa.gov/homr/file/mshr_enhanced.txt.zip',
]

# Index values of each field details entry: [name, start, end, type].
MSHR_FIELD_INDEX_NAME = 0
MSHR_FIELD_INDEX_START = 1
MSHR_FIELD_INDEX_END = 2
MSHR_FIELD_INDEX_TYPE = 3

# Fixed-width layout of the MSHR enhanced table, one tuple per column:
# (field name, start column, end column, picture-style type string).
# The type strings (and their exact spacing, e.g. 'X(40) ') come from the
# upstream MSHR table documentation and must be preserved verbatim.
_MSHR_LAYOUT = [
    ('SOURCE_ID', 1, 20, 'X(20)'),
    ('SOURCE', 22, 31, 'X(10)'),
    ('BEGIN_DATE', 33, 40, 'YYYYMMDD'),
    ('END_DATE', 42, 49, 'YYYYMMDD'),
    ('STATION_STATUS', 51, 70, 'X(20)'),
    ('NCDCSTN_ID', 72, 91, 'X(20)'),
    ('ICAO_ID', 93, 112, 'X(20)'),
    ('WBAN_ID', 114, 133, 'X(20)'),
    ('FAA_ID', 135, 154, 'X(20)'),
    ('NWSLI_ID', 156, 175, 'X(20)'),
    ('WMO_ID', 177, 196, 'X(20)'),
    ('COOP_ID', 198, 217, 'X(20)'),
    ('TRANSMITTAL_ID', 219, 238, 'X(20)'),
    ('GHCND_ID', 240, 259, 'X(20)'),
    ('NAME_PRINCIPAL', 261, 360, 'X(100)'),
    ('NAME_PRINCIPAL_SHORT', 362, 391, 'X(30)'),
    ('NAME_COOP', 393, 492, 'X(100)'),
    ('NAME_COOP_SHORT', 494, 523, 'X(30)'),
    ('NAME_PUBLICATION', 525, 624, 'X(100)'),
    ('NAME_ALIAS', 626, 725, 'X(100)'),
    ('NWS_CLIM_DIV', 727, 736, 'X(10)'),
    ('NWS_CLIM_DIV_NAME', 738, 777, 'X(40)'),
    ('STATE_PROV', 779, 788, 'X(10)'),
    ('COUNTY', 790, 839, 'X(50)'),
    ('NWS_ST_CODE', 841, 842, 'X(2)'),
    ('FIPS_COUNTRY_CODE', 844, 845, 'X(2)'),
    ('FIPS_COUNTRY_NAME', 847, 946, 'X(100)'),
    ('NWS_REGION', 948, 977, 'X(30)'),
    ('NWS_WFO', 979, 988, 'X(10)'),
    ('ELEV_GROUND', 990, 1029, 'X(40)'),
    ('ELEV_GROUND_UNIT', 1031, 1050, 'X(20)'),
    ('ELEV_BAROM', 1052, 1091, 'X(40)'),
    ('ELEV_BAROM_UNIT', 1093, 1112, 'X(20)'),
    ('ELEV_AIR', 1114, 1153, 'X(40)'),
    ('ELEV_AIR_UNIT', 1155, 1174, 'X(20)'),
    ('ELEV_ZERODAT', 1176, 1215, 'X(40)'),
    ('ELEV_ZERODAT_UNIT', 1217, 1236, 'X(20)'),
    ('ELEV_UNK', 1238, 1277, 'X(40)'),
    ('ELEV_UNK_UNIT', 1279, 1298, 'X(20)'),
    ('LAT_DEC', 1300, 1319, 'X(20)'),
    ('LON_DEC', 1321, 1340, 'X(20)'),
    ('LAT_LON_PRECISION', 1342, 1351, 'X(10)'),
    ('RELOCATION', 1353, 1414, 'X(62)'),
    ('UTC_OFFSET', 1416, 1431, '9(16)'),
    ('OBS_ENV', 1433, 1472, 'X(40) '),
    ('PLATFORM', 1474, 1573, 'X(100)'),
]

# Store the row details here, keyed by field name; values keep the original
# [name, start, end, type] list shape indexed by MSHR_FIELD_INDEX_*.
# Built incrementally so additional MSHR_FIELDS[...] assignments compose.
MSHR_FIELDS = {}
for _name, _start, _end, _type in _MSHR_LAYOUT:
    MSHR_FIELDS[_name] = [_name, _start, _end, _type]
