{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Project Notebook\n",
"This notebook takes in the data from NOAA, processes it into frames for the PredNet architecture, and produces a resulting prediction."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Getting a list of files in the processed data folder\n",
"filenames = os.listdir('D:/Nico/Desktop/processed_data')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"header_wanted = [\n",
"    'HOURLYVISIBILITY',\n",
"    'HOURLYDRYBULBTEMPC',\n",
"    'HOURLYWETBULBTEMPC',\n",
"    'HOURLYDewPointTempC',\n",
"    'HOURLYRelativeHumidity',\n",
"    'HOURLYWindSpeed',\n",
"    'HOURLYWindGustSpeed',\n",
"    'HOURLYStationPressure',\n",
"    'HOURLYPressureTendency',\n",
"    'HOURLYPressureChange',\n",
"    'HOURLYSeaLevelPressure',\n",
"    'HOURLYPrecip',\n",
"    'HOURLYAltimeterSetting']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"usecols = ['DATE','STATION'] + header_wanted"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Loading all files into a single pandas DataFrame\n",
"tqdm.pandas()\n",
"df = pd.concat([pd.read_csv('D:/Nico/Desktop/processed_data/{}'.format(x), usecols=usecols, low_memory=False) for x in tqdm(filenames)])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"At this point all the data has been loaded into a single DataFrame and any data cleaning has been applied. The next step is to break the data up by WBAN station ID and place each station's records in a 2D array at the appropriate grid cell."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"stations = pd.read_csv(\"../Playground/stations_unique.csv\", usecols=['STATION_ID', 'LON_SCALED', 'LAT_SCALED'])"
]
},
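{
"cell_type": "markdown",
"metadata": {},
"source": [
"The stations file loaded above already provides `LON_SCALED` and `LAT_SCALED`. For reference, below is a minimal sketch of how raw coordinates could be mapped into this kind of 20x40 grid; the bounding box and the function name `scale_coords` are illustrative assumptions, not taken from the original pipeline."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch of how LON_SCALED / LAT_SCALED could be derived.\n",
"# The bounding box (roughly the contiguous US) is an assumption.\n",
"def scale_coords(lon, lat, width=40, height=20,\n",
"                 lon_min=-125.0, lon_max=-66.0, lat_min=24.0, lat_max=50.0):\n",
"    x = int((lon - lon_min) / (lon_max - lon_min) * (width - 1))\n",
"    y = int((lat - lat_min) / (lat_max - lat_min) * (height - 1))\n",
"    return x, y\n",
"\n",
"scale_coords(-87.9, 41.98)  # e.g. Chicago O'Hare -> (24, 13)"
]
},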
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"height = 20\n",
"width = 40"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# mask[y][x] is 1 where a station reports data, 0 otherwise\n",
"mask = [([0] * width) for i in range(height)]\n",
"\n",
"# WBAN station id -> (x, y) grid coordinates\n",
"wban_loc = dict(zip(stations.STATION_ID, zip(stations.LON_SCALED, stations.LAT_SCALED)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use a nested comprehension so each cell gets its own empty DataFrame\n",
"# ([pd.DataFrame()] * width would alias one object across the whole row)\n",
"grid = [[pd.DataFrame() for j in range(width)] for i in range(height)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# each value is (x, y): scaled longitude is the column, scaled latitude the row\n",
"for key, (x, y) in tqdm(wban_loc.items()):\n",
"    mask[y][x] = 1\n",
"    grid[y][x] = df.loc[df.STATION == key]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: grid cells that contain a station light up\n",
"plt.imshow(mask)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#TODO Handle different sized data, some stacks are too short\n",
"def create_frames(data, height, width, depth):\n",
"    days = []\n",
"    frames = []\n",
"    for i in tqdm(range(depth)):\n",
"        frame = np.zeros((height, width, 12))\n",
"        for y in range(height):\n",
"            for x in range(width):\n",
"                if(not data[y][x].empty):\n",
"                    # the 12 weather variables for hour i at this station\n",
"                    frame[y][x] = data[y][x].iloc[[i], 1:13].values.flatten()\n",
"        frames.append(frame)\n",
"        if((i + 1) % 24 == 0):\n",
"            # a full day of hourly frames collected; start a new day\n",
"            days.append(frames)\n",
"            frames = []\n",
"    return days"
]
},
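{
"cell_type": "markdown",
"metadata": {},
"source": [
"The TODO above notes that stations report different numbers of hourly rows, so `create_frames` can index past the end of a short station's records. One option is to align every station to a shared hourly index before building frames. The sketch below is an assumption about how that could look: the start date, `periods`, and the forward-fill/zero-fill strategy are all illustrative, not from the original pipeline. Note also that moving `DATE` into the index shifts column positions, so the `iloc[[i], 1:13]` slice above would need adjusting to match."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch for the TODO above, not part of the original pipeline.\n",
"def align_station(station_df, start='2017-01-01', periods=504):\n",
"    hourly = pd.date_range(start=start, periods=periods, freq='H')\n",
"    station_df = station_df.copy()\n",
"    # snap timestamps to the hour and keep one reading per hour\n",
"    station_df['DATE'] = pd.to_datetime(station_df['DATE']).dt.floor('H')\n",
"    station_df = station_df.drop_duplicates('DATE').set_index('DATE')\n",
"    # reindex to the common grid; missing hours become NaN, then get filled\n",
"    return station_df.reindex(hourly).ffill().fillna(0)"
]
},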
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def average_grid_fill(mask, data, height, width):\n",
"    for i in range(height):\n",
"        for j in range(width):\n",
"            if(mask[i][j] != 1):\n",
"                neighbors = get_neighbors(j, i, data)\n",
"                # average the neighbors' 12-channel vectors elementwise;\n",
"                # without axis=0, np.mean would collapse them to one scalar\n",
"                data[i][j] = np.mean(neighbors, axis=0)\n",
"    return data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_neighbors(x, y, g):\n",
"    neighbors = []\n",
"    for i in [y-1, y, y+1]:\n",
"        for j in [x-1, x, x+1]:\n",
"            # negative indices would wrap around, so skip them explicitly\n",
"            if(i >= 0 and j >= 0):\n",
"                if(i != y or j != x):\n",
"                    try:\n",
"                        neighbors.append(g[i][j])\n",
"                    except IndexError:\n",
"                        # off the top or right edge of the grid\n",
"                        pass\n",
"    return neighbors"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def store_sequence(frames):\n",
"    import hickle as hkl\n",
"    source_list = []\n",
"\n",
"    # one source entry per frame, naming the day the frame belongs to\n",
"    # (append, not +=, since += would splice a string in character by character)\n",
"    for day in range(len(frames)):\n",
"        for frame in range(len(frames[day])):\n",
"            source_list.append('{}'.format(day))\n",
"\n",
"    hkl.dump(frames, './data/train/x_train.hkl')\n",
"    hkl.dump(source_list, './data/train/x_sources.hkl')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`splits` is a dictionary holding train, test, and val.\n",
"The values for train, test, and val are lists of tuples holding a category and a folder name.\n",
"In the end each image gets a source associated with it,\n",
"and there is only one data dump and one source (hickle) dump for each of train, test, and val."
]
},
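{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of that convention, with illustrative names: the folder names, the `day` category, and the 24-frames-per-day count are assumptions for this example, not values from the original pipeline. Each frame gets one source string, so a sequence generator can avoid building sequences that cross a folder boundary."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch of the splits/sources convention described above.\n",
"splits = {\n",
"    'train': [('day', '0'), ('day', '1')],  # (category, folder name) tuples\n",
"    'val': [('day', '2')],\n",
"    'test': [('day', '3')],\n",
"}\n",
"for split, entries in splits.items():\n",
"    sources = []\n",
"    for category, folder in entries:\n",
"        # one source entry per image; 24 hourly frames per day here\n",
"        sources += ['{}-{}'.format(category, folder)] * 24\n",
"    # one data dump and one source dump per split, e.g.\n",
"    # hkl.dump(X, './data/{0}/x_{0}.hkl'.format(split))\n",
"    # hkl.dump(sources, './data/{}/x_sources.hkl'.format(split))"
]
},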
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"frames = create_frames(grid, height, width, 504)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Average-fill the empty cells of every frame in every day\n",
"for x in tqdm(range(len(frames))):\n",
"    for y in range(len(frames[0])):\n",
"        frames[x][y] = average_grid_fill(mask, frames[x][y], height, width)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"store_sequence(frames)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"np_frames = np.array(frames)\n",
"np_frames.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"store_sequence(np_frames)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"At this point the data has been processed into discrete frames, and it is time to run it through the PredNet architecture for training."
]
},
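{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before setting up the model, a quick sanity check on the dumps written by `store_sequence` above: each frame received exactly one source entry, so the frame count and the source count should agree. This cell is an illustrative addition, not part of the original flow."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sanity check on the dumps written by store_sequence above\n",
"import hickle as hkl\n",
"X_check = np.asarray(hkl.load('./data/train/x_train.hkl'))\n",
"sources_check = hkl.load('./data/train/x_sources.hkl')\n",
"print(X_check.shape, len(sources_check))  # expect len == days * 24"
]
},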
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"source": [
"np.random.seed(123)\n",
"from six.moves import cPickle\n",
"\n",
"from keras import backend as K\n",
"from keras.models import Model\n",
"from keras.layers import Input, Dense, Flatten\n",
"from keras.layers import LSTM\n",
"from keras.layers import TimeDistributed\n",
"from keras.callbacks import LearningRateScheduler, ModelCheckpoint\n",
"from keras.optimizers import Adam\n",
"\n",
"from prednet import PredNet\n",
"from data_utils import SequenceGenerator"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"WEIGHTS_DIR = './weights/'\n",
"DATA_DIR = './data/'"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"save_model = True # if weights will be saved\n",
"weights_file = os.path.join(WEIGHTS_DIR, 'prednet_weather_weights.hdf5') # where weights will be saved\n",
"json_file = os.path.join(WEIGHTS_DIR, 'prednet_weather_model.json')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Data files\n",
"#TODO: Use the files from NOAA and process them into proper frames\n",
"train_file = os.path.join(DATA_DIR, 'train/', 'x_train.hkl')\n",
"train_sources = os.path.join(DATA_DIR, 'train/', 'x_sources.hkl')\n",
"#val_file = os.path.join(DATA_DIR, 'X_val.hkl')\n",
"#val_sources = os.path.join(DATA_DIR, 'sources_val.hkl')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Training parameters\n",
"nb_epoch = 1\n",
"batch_size = 4\n",
"samples_per_epoch = 500\n",
"N_seq_val = 100 # number of sequences to use for validation"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Model parameters\n",
"n_channels, im_height, im_width = (12, 20, 40)\n",
"input_shape = (n_channels, im_height, im_width) if K.image_data_format() == 'channels_first' else (im_height, im_width, n_channels)\n",
"stack_sizes = (n_channels, 48, 96)\n",
"R_stack_sizes = stack_sizes\n",
"A_filt_sizes = (3, 3)\n",
"Ahat_filt_sizes = (3, 3, 3)\n",
"R_filt_sizes = (3, 3, 3)\n",
"layer_loss_weights = np.array([1., 0., 0.]) # weighting for each layer in final loss; this 3-layer model uses the \"L_0\" scheme (train on the lowest layer's error only); an \"L_all\" scheme would also weight the upper layers, e.g. [1., 0.1, 0.1]\n",
"layer_loss_weights = np.expand_dims(layer_loss_weights, 1)\n",
"nt = 24 # number of timesteps used for sequences in training (one day of hourly frames)\n",
"time_loss_weights = 1. / (nt - 1) * np.ones((nt, 1)) # equally weight all timesteps except the first\n",
"time_loss_weights[0] = 0"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"prednet = PredNet(stack_sizes, R_stack_sizes,\n",
"                  A_filt_sizes, Ahat_filt_sizes, R_filt_sizes,\n",
"                  output_mode='error', return_sequences=True)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"inputs = Input(shape=(nt,) + input_shape)\n",
"errors = prednet(inputs) # errors will be (batch_size, nt, nb_layers)\n",
"errors_by_time = TimeDistributed(Dense(1, trainable=False), weights=[layer_loss_weights, np.zeros(1)], trainable=False)(errors) # calculate weighted error by layer\n",
"errors_by_time = Flatten()(errors_by_time) # will be (batch_size, nt)\n",
"final_errors = Dense(1, weights=[time_loss_weights, np.zeros(1)], trainable=False)(errors_by_time) # weight errors by time\n",
"model = Model(inputs=inputs, outputs=final_errors)\n",
"model.compile(loss='mean_absolute_error', optimizer='adam')"
]
},
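{
"cell_type": "markdown",
"metadata": {},
"source": [
"The two frozen `Dense` layers above implement fixed dot products against the weight vectors defined earlier. For intuition, here is the same weighting written out in NumPy for one sample; this cell is purely illustrative and is not part of the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# What the frozen Dense layers compute, written out for one sample\n",
"e = np.random.rand(nt, 3)  # per-timestep, per-layer errors (nt, nb_layers)\n",
"by_time = e @ layer_loss_weights  # (nt, 1): weight the layers; only layer 0 counts here\n",
"final = by_time[:, 0] @ time_loss_weights[:, 0]  # scalar: weight the timesteps, t=0 excluded\n",
"final"
]
},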
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"_________________________________________________________________\n",
"Layer (type)                 Output Shape              Param #   \n",
"=================================================================\n",
"input_1 (InputLayer)         (None, 24, 20, 40, 12)    0         \n",
"_________________________________________________________________\n",
"pred_net_1 (PredNet)         (None, 24, 3)             1645548   \n",
"_________________________________________________________________\n",
"time_distributed_1 (TimeDist (None, 24, 1)             4         \n",
"_________________________________________________________________\n",
"flatten_1 (Flatten)          (None, 24)                0         \n",
"_________________________________________________________________\n",
"dense_2 (Dense)              (None, 1)                 25        \n",
"=================================================================\n",
"Total params: 1,645,577\n",
"Trainable params: 1,645,548\n",
"Non-trainable params: 29\n",
"_________________________________________________________________\n"
]
}
],
"source": [
"model.summary()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Dummy targets for the commented-out model.fit call further below\n",
"truth = []\n",
"for i in range(20):\n",
"    truth.append(np.random.randint(255, size=1))\n",
"output = np.array(truth)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"train_generator = SequenceGenerator(train_file, train_sources, nt, batch_size=batch_size, shuffle=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"lr_schedule = lambda epoch: 0.001 if epoch < 75 else 0.0001 # start with lr of 0.001 and then drop to 0.0001 after 75 epochs\n",
"callbacks = [LearningRateScheduler(lr_schedule)]\n",
"#history = model.fit(np_frames, output, batch_size, nb_epoch, callbacks=callbacks)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/1\n"
]
}
],
"source": [
"# steps_per_epoch must be an integer, hence the floor division\n",
"history = model.fit_generator(train_generator, samples_per_epoch // batch_size, nb_epoch, callbacks=callbacks)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}