noaa-ghcn-daily.scripts.weather_convert_to_xml.py Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of apache-vxquery-benchmark
Apache VXQuery Benchmark
The newest version!
#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import textwrap
from datetime import date
import os
from collections import OrderedDict

# Custom modules.
from weather_config_ghcnd import *
from weather_config_mshr import *
from weather_download_files import *

class WeatherConvertToXML:
    
    STATES = OrderedDict({
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
    })
    
    MONTHS = [
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December"
    ]
    
    token = ""
    
    def __init__(self, base_path, save_path, debug_output):
        self.save_path = save_path
        self.debug_output = debug_output

        # Extra support files.
        self.ghcnd_countries = base_path + '/ghcnd-countries.txt'
        self.ghcnd_inventory = base_path + '/ghcnd-inventory.txt'
        self.ghcnd_states = base_path + '/ghcnd-states.txt'
        self.ghcnd_stations = base_path + '/ghcnd-stations.txt'
        
        # MSHR support files.
        self.mshr_stations = base_path + '/mshr_enhanced_201402.txt'
        
    def set_token(self, token):
        self.token = token
        
    def get_field_from_definition(self, row, field_definition):
        return row[(field_definition[FIELD_INDEX_START] - 1):field_definition[FIELD_INDEX_END]]
    
    def get_field(self, fields_array, row, index):
        return row[(fields_array[index][FIELD_INDEX_START] - 1):fields_array[index][FIELD_INDEX_END]]
    
    def get_dly_field(self, row, index):
        return self.get_field(DLY_FIELDS, row, index)
    
    def print_row_files(self, row):
        for field in DLY_FIELDS:
            print str(field[FIELD_INDEX_NAME]) + " = '" + row[(field[FIELD_INDEX_START] - 1):field[FIELD_INDEX_END]] + "'"
    
    def save_file(self, filename, contents):
        file = open(filename, 'w')
        file.write(contents)
        file.close()
        return filename
    
    def get_folder_size(self, folder_name):
        total_size = 0
        for dirpath, dirnames, filenames in os.walk(folder_name):
            for f in filenames:
                fp = os.path.join(dirpath, f)
                total_size += os.path.getsize(fp)
        return total_size

    def process_one_month_sensor_set(self, records, page):
        # Default
        return 0
    
    def process_station_data(self, row):
        # Default
        return 0
    
    def get_base_folder(self, station_id, data_type="sensors"):
        return build_base_save_folder(self.save_path, station_id, data_type) 
    
    def process_inventory_file(self):
        print "Processing inventory file"
        file_stream = open(self.ghcnd_inventory, 'r')
        
        csv_header = ['ID', 'SENSORS', 'SENSORS_COUNT', 'MAX_YEARS', 'TOTAL_YEARS_FOR_ALL_SENSORS']
        row = file_stream.readline()
        csv_inventory = {}
        for row in file_stream:
            id = self.get_field_from_definition(row, INVENTORY_FIELDS['ID'])
            sensor_id = self.get_field_from_definition(row, INVENTORY_FIELDS['ELEMENT'])
            start = int(self.get_field_from_definition(row, INVENTORY_FIELDS['FIRSTYEAR']))
            end = int(self.get_field_from_definition(row, INVENTORY_FIELDS['LASTYEAR']))
            if id in csv_inventory:
                new_count = str(int(csv_inventory[id][2]) + 1)
                new_max = str(max(int(csv_inventory[id][3]), (end - start)))
                new_total = str(int(csv_inventory[id][3]) + end - start)
                csv_inventory[id] = [id, (csv_inventory[id][1] + "," + sensor_id), new_count, new_max, new_total]
            else:
                csv_inventory[id] = [id, sensor_id, str(1), str(end - start), str(end - start)]
                
        path = self.save_path + "/inventory.csv"
        self.save_csv_file(path, csv_inventory, csv_header)
    
    def save_csv_file(self, path, csv_inventory, header):
        csv_content = "|".join(header) + "\n"
        for row_id in csv_inventory:
            csv_content += "|".join(csv_inventory[row_id]) + "\n"
        self.save_file(path, csv_content)
        

    def process_station_file(self, file_name):
        print "Processing station file: " + file_name
        file_stream = open(file_name, 'r')
        
        row = file_stream.readline()
        return self.process_station_data(row)

    def process_sensor_file(self, file_name, max_files, sensor_max=99):
        print "Processing sensor file: " + file_name
        file_stream = open(file_name, 'r')
    
        month_last = 0
        year_last = 0
        records = []
        page = 0
        sensor_count = 0
    
        file_count = 0
        for row in file_stream:
            month = self.get_dly_field(row, DLY_FIELD_MONTH)
            year = self.get_dly_field(row, DLY_FIELD_YEAR)
            
            if (month_last != 0 and year_last != 0) and (sensor_count >= sensor_max or month != month_last or year != year_last):
                # process set
                file_count += self.process_one_month_sensor_set(records, page)
                records = []
                if sensor_count >= sensor_max and month == month_last and year == year_last:
                    # start a new page.
                    page += 1
                else:
                    # start over.
                    page = 0
                sensor_count = 0
            
            records.append(row)
            sensor_count += 1
            if max_files != 0 and file_count >= max_files:
                # Stop creating more files after the max is reached.
                break

            month_last = month
            year_last = year
        
        station_id = self.get_dly_field(records[0], DLY_FIELD_ID)
        data_size = self.get_folder_size(self.get_base_folder(station_id) + "/" + station_id)
        print "Created " + str(file_count) + " XML files for a data size of " + str(data_size) + "."
        
        return (file_count, data_size)
    
    def convert_c2f(self, c):
        return (9 / 5 * c) + 32
    
    def default_xml_web_service_start(self):
        field_xml = ""
        field_xml += "\n"
        return field_xml
    
    def default_xml_data_start(self, total_records):
        field_xml = ""
        field_xml += "\n"
        return field_xml
    
    def default_xml_station_start(self):
        field_xml = ""
        field_xml = "\n"
        return field_xml
    
    def default_xml_field_date(self, report_date, indent=2):
        field_xml = ""
        field_xml += self.get_indent_space(indent) + "" + str(report_date.year) + "-" + str(report_date.month).zfill(2) + "-" + str(report_date.day).zfill(2) + "T00:00:00.000\n"
        return field_xml
    
    def default_xml_mshr_station_additional(self, station_id):
        """The web service station data is generate from the MSHR data supplemented with GHCN-Daily."""
        station_mshr_row = ""
        stations_mshr_file = open(self.mshr_stations, 'r')
        for line in stations_mshr_file:
            if station_id == self.get_field_from_definition(line, MSHR_FIELDS['GHCND_ID']).strip():
                station_mshr_row = line
                break
        
        if station_mshr_row == "":
            return ""

        additional_xml = ""

        county = self.get_field_from_definition(station_mshr_row, MSHR_FIELDS['COUNTY']).strip()
        if county != "":
            additional_xml += self.default_xml_location_labels("CNTY", "FIPS:-9999", county)
            
        country_code = self.get_field_from_definition(station_mshr_row, MSHR_FIELDS['FIPS_COUNTRY_CODE']).strip()
        country_name = self.get_field_from_definition(station_mshr_row, MSHR_FIELDS['FIPS_COUNTRY_NAME']).strip()
        if country_code != "" and country_name != "":
            additional_xml += self.default_xml_location_labels("CNTRY", "FIPS:" + country_code, country_name)
        
        return additional_xml

    def default_xml_location_labels(self, type, id, display_name):
        label_xml = ""
        label_xml += self.default_xml_start_tag("locationLabels", 2)
        label_xml += self.default_xml_element("type", type, 3)
        label_xml += self.default_xml_element("id", id, 3)
        label_xml += self.default_xml_element("displayName", display_name, 3)
        label_xml += self.default_xml_end_tag("locationLabels", 2)
        return label_xml
        

    def default_xml_web_service_station(self, station_id):
        """The web service station data is generate from available historical sources."""
        station_ghcnd_row = ""
        stations_ghcnd_file = open(self.ghcnd_stations, 'r')
        for line in stations_ghcnd_file:
            if station_id == self.get_field_from_definition(line, STATIONS_FIELDS['ID']):
                station_ghcnd_row = line
                break
    
        xml_station = ""
        xml_station += self.default_xml_start_tag("station", 1)
        
        xml_station += self.default_xml_element("id", "GHCND:" + station_id, 2)
        xml_station += self.default_xml_element("displayName", self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['NAME']).strip(), 2)
        xml_station += self.default_xml_element("latitude", self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['LATITUDE']).strip(), 2)
        xml_station += self.default_xml_element("longitude", self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['LONGITUDE']).strip(), 2)
        
        elevation = self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['ELEVATION']).strip()
        if elevation != "-999.9":
            xml_station += self.default_xml_element("elevation", elevation, 2)
        
        state_code = self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['STATE']).strip()
        if state_code != "" and state_code in self.STATES:
            xml_station += self.default_xml_location_labels("ST", "FIPS:" + str(self.STATES.keys().index(state_code)), self.STATES[state_code])
        
        # Add the MSHR data to the station generated information.
        xml_station += self.default_xml_mshr_station_additional(station_id)
            
        xml_station += self.default_xml_end_tag("station", 1)
        return xml_station
        
    def default_xml_day_reading_as_field(self, row, day):
        day_index = DLY_FIELD_DAY_OFFSET + ((day - 1) * DLY_FIELD_DAY_FIELDS)
        value = self.get_dly_field(row, day_index);
        if value == "-9999":
            return ""
    
        field_xml = ""
        field_id = self.get_dly_field(row, DLY_FIELD_ELEMENT)
        if field_id in ("MDTN", "MDTX", "MNPN", "MXPN", "TMAX", "TMIN", "TOBS",):
            # Add both the celcius and fahrenheit temperatures.
            celcius = float(value) / 10
            field_xml += "            <" + field_id + "_c>" + str(celcius) + "\n"
            fahrenheit = self.convert_c2f(celcius)
            field_xml += "            <" + field_id + "_f>" + str(fahrenheit) + "\n"
        elif field_id in ("AWND", "EVAP", "PRCP", "THIC", "WESD", "WESF", "WSF1", "WSF2", "WSF5", "WSFG", "WSFI", "WSFM",):
            # Field values that are in tenths.
            converted_value = float(value) / 10
            field_xml += "            <" + field_id + ">" + str(converted_value) + "\n"
        elif field_id in ("ACMC", "ACMH", "ACSC", "ACSH", "PSUN",):
            # Fields is a percentage.
            field_xml += "            <" + field_id + ">" + value.strip() + "\n"
        elif field_id in ("FMTM", "PGTM",):
            # Fields is a time value HHMM.
            field_xml += "            <" + field_id + ">" + value.strip() + "\n"
        elif field_id in ("DAEV", "DAPR", "DASF", "DATN", "DATX", "DAWM", "DWPR", "FRGB", "FRGT", "FRTH", "GAHT", "MDSF", "MDWM", "MDEV", "MDPR", "SNOW", "SNWD", "TSUN", "WDF1", "WDF2", "WDF5", "WDFG", "WDFI", "WDFM", "WDMV",):
            # Fields with no alternation needed.
            field_xml += "            <" + field_id + ">" + value.strip() + "\n"
        else:
            field_xml += "            " + field_id + "\n"
            
        # print field_xml
        return field_xml
    
    def default_xml_day_reading(self, row, day, indent=2):
        day_index = DLY_FIELD_DAY_OFFSET + ((day - 1) * DLY_FIELD_DAY_FIELDS)
        value = self.get_dly_field(row, day_index);
        mflag = self.get_dly_field(row, day_index + 1);
        qflag = self.get_dly_field(row, day_index + 2);
        sflag = self.get_dly_field(row, day_index + 3);

        if value == "-9999":
            return ""

        indent_space = self.get_indent_space(indent)
        field_id = self.get_dly_field(row, DLY_FIELD_ELEMENT)
        station_id = "GHCND:" + self.get_dly_field(row, DLY_FIELD_ID)
    
        field_xml = ""
        field_xml += indent_space + "" + field_id + "\n"
        field_xml += indent_space + "" + station_id + "\n"
        field_xml += indent_space + "" + value.strip() + "\n"
        field_xml += indent_space + "\n"
        field_xml += indent_space + indent_space + "" + mflag.strip() + "\n"
        field_xml += indent_space + indent_space + "" + qflag.strip() + "\n"
        field_xml += indent_space + indent_space + "" + sflag.strip() + "\n"
        field_xml += indent_space + indent_space + "\n"
        field_xml += indent_space + "\n"

        # print field_xml
        return field_xml
    
    def default_xml_end(self):
        return textwrap.dedent("""\
            """)

    def default_xml_data_end(self):
        return self.default_xml_end_tag("dataCollection", 0)

    def default_xml_station_end(self):
        return self.default_xml_end_tag("stationCollection", 0)

    def default_xml_element(self, tag, data, indent=1):
        return self.get_indent_space(indent) + "<" + tag + ">" + data + "\n"

    def default_xml_start_tag(self, tag, indent=1):
        return self.get_indent_space(indent) + "<" + tag + ">\n"

    def default_xml_end_tag(self, tag, indent=1):
        return self.get_indent_space(indent) + "\n"

    def get_indent_space(self, indent):
        return (" " * (4 * indent))
    

class WeatherWebServiceMonthlyXMLFile(WeatherConvertToXML):
    """The web service class details how to create files similar to the NOAA web service."""
    skip_downloading = False
    # Station data
    def process_station_data(self, row):
        """Adds a single station record file either from downloading the data or generating a similar record."""
        station_id = self.get_dly_field(row, DLY_FIELD_ID)
        download = 0
        if self.token is not "" and not self.skip_downloading:
            download = self.download_station_data(station_id, self.token, True)
            if download == 0:
                self.skip_downloading = True
        
        # If not downloaded, generate.
        if download != 0:
            return download
        else:
            # Information for each daily file.
            station_xml_file = self.default_xml_web_service_start()
            station_xml_file += self.default_xml_station_start()
            station_xml_file += self.default_xml_web_service_station(station_id)
            station_xml_file += self.default_xml_station_end()
            
            # Remove white space.
            station_xml_file = station_xml_file.replace("\n", "");
            station_xml_file = station_xml_file.replace(self.get_indent_space(1), "");

            # Make sure the station folder is available.
            ghcnd_xml_station_path = self.get_base_folder(station_id, "stations")
            if not os.path.isdir(ghcnd_xml_station_path):
                os.makedirs(ghcnd_xml_station_path)
                    
            # Save XML string to disk.
            save_file_name = ghcnd_xml_station_path + station_id + ".xml"
            save_file_name = self.save_file(save_file_name, station_xml_file)
    
            if save_file_name is not "":
                if self.debug_output:
                    print "Wrote file: " + save_file_name
                return 1
            else:
                return 0

    # Station data
    def download_station_data(self, station_id, token, reset=False):
        """Downloads the station data from the web service."""
        import time
        time.sleep(2)
        # Make sure the station folder is available.
        ghcnd_xml_station_path = self.get_base_folder(station_id, "stations")
        if not os.path.isdir(ghcnd_xml_station_path):
            os.makedirs(ghcnd_xml_station_path)
                
        # Build download URL.
        url = "http://www.ncdc.noaa.gov/cdo-services/services/datasets/GHCND/stations/GHCND:" + station_id + ".xml?token=" + token
        url_file = urllib.urlopen(url)
        station_xml_file = ""
        while (True):
            line = url_file.readline()
            if not line:
                break
            station_xml_file += line
        
        if station_xml_file.find("") != -1:
            if self.debug_output:
                print "Error in station download"
            return 0
        
        # Save XML string to disk.
        save_file_name = ghcnd_xml_station_path + station_id + ".xml"
        save_file_name = self.save_file(save_file_name, station_xml_file)
    
        if save_file_name is not "":
            if self.debug_output:
                print "Wrote file: " + save_file_name
            return 2
        else:
            return 0

    # Sensor data
    def process_one_month_sensor_set(self, records, page):
        """Generates records for a station using the web service xml layout."""
        found_data = False        
        year = int(self.get_dly_field(records[0], DLY_FIELD_YEAR))
        month = int(self.get_dly_field(records[0], DLY_FIELD_MONTH))
    
        station_id = self.get_dly_field(records[0], DLY_FIELD_ID)

        # Information for each daily file.
        count = 0
        daily_xml_file = ""
        
        for day in range(1, 32):
            try:
                # TODO find out what is a valid python date range? 1889?
                # Attempt to see if this is valid date.
                report_date = date(year, month, day)

                for record in records:
                    record_xml_snip = self.default_xml_day_reading(record, report_date.day)
                    if record_xml_snip is not "":
                        daily_xml_file += self.default_xml_start_tag("data")
                        daily_xml_file += self.default_xml_field_date(report_date)
                        daily_xml_file += record_xml_snip
                        daily_xml_file += self.default_xml_end_tag("data")
                        found_data = True
                        count += 1

            except ValueError:
                pass

        daily_xml_file = self.default_xml_web_service_start() + self.default_xml_data_start(count) + daily_xml_file + self.default_xml_data_end()
        daily_xml_file = daily_xml_file.replace("\n", "");
        daily_xml_file = daily_xml_file.replace(self.get_indent_space(1), "");

        if not found_data:
            return 0

        # Make sure the station folder is available.
        ghcnd_xml_station_path = self.get_base_folder(station_id) + "/" + station_id + "/" + str(report_date.year) + "/"
        if not os.path.isdir(ghcnd_xml_station_path):
            os.makedirs(ghcnd_xml_station_path)
                
        # Save XML string to disk.
        save_file_name = ghcnd_xml_station_path + build_sensor_save_filename(station_id, report_date, page)
        save_file_name = self.save_file(save_file_name, daily_xml_file)

        if save_file_name is not "":
            if self.debug_output:
                print "Wrote file: " + save_file_name
            return 1
        else:
            return 0

def build_base_save_folder(save_path, station_id, data_type="sensors"):
    # Default
    station_prefix = station_id[:3]
    return save_path + data_type + "/" + station_prefix + "/"

def build_sensor_save_filename(station_id, report_date, page):
    # Default
    return station_id + "_" + str(report_date.year).zfill(4) + str(report_date.month).zfill(2) + "_" + str(page) + ".xml"