Weather-Project/Project Final/.ipynb_checkpoints/Project-checkpoint.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Project Notebook\n",
    "This is the full and complete notebook that takes in the data from NOAA and processes it into frames to be used in the PredNet architecture and produce a resulting prediction."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Getting a list of files in raw data folder\n",
    "filenames = os.listdir('D:/Nico/Desktop/processed_data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "header_wanted = [\n",
    " 'HOURLYVISIBILITY',\n",
    " 'HOURLYDRYBULBTEMPC',\n",
    " 'HOURLYWETBULBTEMPC',\n",
    " 'HOURLYDewPointTempC',\n",
    " 'HOURLYRelativeHumidity',\n",
    " 'HOURLYWindSpeed',\n",
    " 'HOURLYWindGustSpeed',\n",
    " 'HOURLYStationPressure',\n",
    " 'HOURLYPressureTendency',\n",
    " 'HOURLYPressureChange',\n",
    " 'HOURLYSeaLevelPressure',\n",
    " 'HOURLYPrecip',\n",
    " 'HOURLYAltimeterSetting']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "usecols = ['DATE','STATION'] + header_wanted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Loading all files into a pandas Dataframe\n",
    "tqdm.pandas()\n",
    "df = pd.concat([pd.read_csv('D:/Nico/Desktop/processed_data/{}'.format(x), usecols=usecols, low_memory=False) for x in tqdm(filenames)])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "At this point all the data has been loaded into a single dataframe and any data changes have been made. The next step is to break the data up by WBAN and place in a 2D array at the appropriate grid cell. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "stations = pd.read_csv(\"../Playground/stations_unique.csv\", usecols = ['STATION_ID', 'LON_SCALED', 'LAT_SCALED'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "height = 20\n",
    "width = 40"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mask = [([0] * width) for i in range(height)]\n",
    "\n",
    "wban_loc = dict(zip(stations.STATION_ID,zip(stations.LON_SCALED,stations.LAT_SCALED)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grid = [([pd.DataFrame()] * width) for i in range(height)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for key, value in tqdm(wban_loc.items()):\n",
    "    mask[value[1]][value[0]] = 1\n",
    "    grid[value[1]][value[0]] = df.loc[df.STATION == key]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.imshow(mask)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#TODO Handle different sized data some stacks too short\n",
    "def create_frames(data,height, width, depth):\n",
    "    days = []\n",
    "    frames = []\n",
    "    for i in tqdm(range(depth)):\n",
    "        frame = np.zeros((height,width,12))\n",
    "        for y in range(height):\n",
    "            for x in range(width):\n",
    "                if(not data[y][x].empty):\n",
    "                    frame[y][x] = data[y][x].iloc[[i],1:13].values.flatten()\n",
    "        if((i+1)%24 != 0):\n",
    "            frames.append(frame)\n",
    "        else:\n",
    "            frames.append(frame)\n",
    "            days.append(frames)\n",
    "            frames = []\n",
    "    return days"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def average_grid_fill(mask,data, height, width):\n",
    "        \n",
    "    for i in range(height):\n",
    "        for j in range(width):\n",
    "            if(mask[i][j] != 1):\n",
    "                neighbors = get_neighbors(j,i,data)\n",
    "                data[i][j] = np.mean(neighbors)\n",
    "            \n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_neighbors(x,y,g):\n",
    "    neighbors = []\n",
    "    for i in [y-1,y,y+1]:\n",
    "        for j in [x-1,x,x+1]:\n",
    "            if(i >= 0 and j >= 0):\n",
    "                if(i != y or j != x ):\n",
    "                    try:\n",
    "                        neighbors.append(g[i][j])\n",
    "                    except:\n",
    "                        pass\n",
    "    return neighbors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def store_sequence(frames):\n",
    "    import hickle as hkl\n",
    "    source_list = []\n",
    "    \n",
    "    for days in range(len(frames)):\n",
    "        for day in range(len(frames[days])):\n",
    "            source_list += '{}'.format(days)\n",
    "    \n",
    "    hkl.dump(frames, './data/train/x_train.hkl')\n",
    "    hkl.dump(source_list, './data/train/x_sources.hkl')\n",
    "            "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Splits is a dictionary holding train, test, val\n",
    "the values for train, test, and val are lists of tuples holding category and folder name\n",
    "in the end each image gets a source associated with it\n",
    "there is only one data and one source hickle dump for each of train test and val"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "frames = create_frames(grid, height, width,504)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#TODO use loop to average each frame\n",
    "for x in tqdm(range(len(frames))):\n",
    "    for y in range(len(frames[0])):\n",
    "        frames[x][y] = average_grid_fill(mask, frames[x][y], height, width )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "store_sequence(frames)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np_frames = np.array(frames)\n",
    "np_frames.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "store_sequence(np_frames)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "At this point I have processed the data and made it into discrete frames of data and it is time to run it through the PredNet architecture for training."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "np.random.seed(123)\n",
    "from six.moves import cPickle\n",
    "\n",
    "from keras import backend as K\n",
    "from keras.models import Model\n",
    "from keras.layers import Input, Dense, Flatten\n",
    "from keras.layers import LSTM\n",
    "from keras.layers import TimeDistributed\n",
    "from keras.callbacks import LearningRateScheduler, ModelCheckpoint\n",
    "from keras.optimizers import Adam\n",
    "\n",
    "from prednet import PredNet\n",
    "from data_utils import SequenceGenerator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "WEIGHTS_DIR = './weights/'\n",
    "DATA_DIR = './data/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "save_model = True  # if weights will be saved\n",
    "weights_file = os.path.join(WEIGHTS_DIR, 'prednet_weather_weights.hdf5')  # where weights will be saved\n",
    "json_file = os.path.join(WEIGHTS_DIR, 'prednet_weather_model.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data files\n",
    "#TODO: Use the files from NOAA and process them into proper frames\n",
    "train_file = os.path.join(DATA_DIR,'train/', 'x_train.hkl')\n",
    "train_sources = os.path.join(DATA_DIR, 'train/', 'x_sources.hkl')\n",
    "#val_file = os.path.join(DATA_DIR, 'X_val.hkl')\n",
    "#val_sources = os.path.join(DATA_DIR, 'sources_val.hkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training parameters\n",
    "nb_epoch = 1\n",
    "batch_size = 4\n",
    "samples_per_epoch = 500\n",
    "N_seq_val = 100  # number of sequences to use for validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model parameters\n",
    "n_channels, im_height, im_width = (12, 20, 40)\n",
    "input_shape = (n_channels, im_height, im_width) if K.image_data_format() == 'channels_first' else (im_height, im_width, n_channels)\n",
    "stack_sizes = (n_channels, 48, 96)\n",
    "R_stack_sizes = stack_sizes\n",
    "A_filt_sizes = (3, 3)\n",
    "Ahat_filt_sizes = (3, 3, 3)\n",
    "R_filt_sizes = (3, 3, 3)\n",
    "layer_loss_weights = np.array([1., 0., 0.])  # weighting for each layer in final loss; \"L_0\" model:  [1, 0, 0, 0], \"L_all\": [1, 0.1, 0.1, 0.1]\n",
    "layer_loss_weights = np.expand_dims(layer_loss_weights, 1)\n",
    "nt = 24  # number of timesteps used for sequences in training\n",
    "time_loss_weights = 1./ (nt - 1) * np.ones((nt,1))  # equally weight all timesteps except the first\n",
    "time_loss_weights[0] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "prednet = PredNet(stack_sizes, R_stack_sizes,\n",
    "                  A_filt_sizes, Ahat_filt_sizes, R_filt_sizes,\n",
    "                  output_mode='error', return_sequences=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "inputs = Input(shape=(nt,) + input_shape)\n",
    "errors = prednet(inputs)  # errors will be (batch_size, nt, nb_layers)\n",
    "errors_by_time = TimeDistributed(Dense(1, trainable=False), weights=[layer_loss_weights, np.zeros(1)], trainable=False)(errors)  # calculate weighted error by layer\n",
    "errors_by_time = Flatten()(errors_by_time)  # will be (batch_size, nt)\n",
    "final_errors = Dense(1, weights=[time_loss_weights, np.zeros(1)], trainable=False)(errors_by_time)  # weight errors by time\n",
    "model = Model(inputs=inputs, outputs=final_errors)\n",
    "model.compile(loss='mean_absolute_error', optimizer='adam')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "input_1 (InputLayer)         (None, 24, 20, 40, 12)    0         \n",
      "_________________________________________________________________\n",
      "pred_net_1 (PredNet)         (None, 24, 3)             1645548   \n",
      "_________________________________________________________________\n",
      "time_distributed_1 (TimeDist (None, 24, 1)             4         \n",
      "_________________________________________________________________\n",
      "flatten_1 (Flatten)          (None, 24)                0         \n",
      "_________________________________________________________________\n",
      "dense_2 (Dense)              (None, 1)                 25        \n",
      "=================================================================\n",
      "Total params: 1,645,577\n",
      "Trainable params: 1,645,548\n",
      "Non-trainable params: 29\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "truth = []\n",
    "for i in range(20):\n",
    "    truth.append(np.random.randint(255,size=(1)))\n",
    "output = np.array(truth)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_generator = SequenceGenerator(train_file, train_sources, nt, batch_size=batch_size, shuffle=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lr_schedule = lambda epoch: 0.001 if epoch < 75 else 0.0001    # start with lr of 0.001 and then drop to 0.0001 after 75 epochs\n",
    "callbacks = [LearningRateScheduler(lr_schedule)]\n",
    "#history = model.fit(np_frames, output ,batch_size, nb_epoch, callbacks=callbacks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/1\n"
     ]
    }
   ],
   "source": [
    "history = model.fit_generator(train_generator, samples_per_epoch / batch_size, nb_epoch, callbacks=callbacks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}