{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "title: \"Built-In Online Transformation Functions\"\n",
    "date: 2022-01-04\n",
    "type: technical_note\n",
    "draft: false\n",
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Built-In Online Transformation Functions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create connection to hsfs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting Spark application\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<tr><th>ID</th><th>YARN Application ID</th><th>Kind</th><th>State</th><th>Spark UI</th><th>Driver log</th></tr><tr><td>5</td><td>application_1641207805815_0007</td><td>pyspark</td><td>idle</td><td><a target=\"_blank\" href=\"/hopsworks-api/yarnui/https://hopsworks0.logicalclocks.com:8089/proxy/application_1641207805815_0007/\">Link</a></td><td><a target=\"_blank\" href=\"/hopsworks-api/yarnui/https://hopsworks0.logicalclocks.com:8044/node/containerlogs/container_1641207805815_0007_01_000001/demo_fs_meb10000__meb10000\">Link</a></td></tr></table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SparkSession available as 'spark'.\n",
      "Connected. Call `.close()` to terminate connection gracefully."
     ]
    }
   ],
   "source": [
    "import hsfs\n",
    "connection = hsfs.connection()\n",
    "# get a reference to the feature store, you can access also shared feature stores by providing the feature store name\n",
    "fs = connection.get_feature_store()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate HSFS Built-In Online Transformations\n",
    "HSFS comes with built-in transformation functions such as `min_max_scaler`, `standard_scaler`, `robust_scaler` and `label_encoder`. To register these functions with the feature store call `register_builtin_transformation_functions` on the feature store handle. \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "fs.register_builtin_transformation_functions()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get All Online Transformations Available in the Feature Store"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<hsfs.transformation_function.TransformationFunction object at 0x7f90a8f0b280>, <hsfs.transformation_function.TransformationFunction object at 0x7f90a8f03ca0>, <hsfs.transformation_function.TransformationFunction object at 0x7f90a8f030d0>, <hsfs.transformation_function.TransformationFunction object at 0x7f90a8f03370>]"
     ]
    }
   ],
   "source": [
    "fs.get_transformation_functions()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get Online Transformation Function by Name and Version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "min_max_scaler\n",
      "1"
     ]
    }
   ],
   "source": [
    "min_max_scaler = fs.get_transformation_function(name=\"min_max_scaler\")\n",
    "print(min_max_scaler.name)\n",
    "print(min_max_scaler.version)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "standard_scaler\n",
      "1"
     ]
    }
   ],
   "source": [
    "standard_scaler = fs.get_transformation_function(name=\"standard_scaler\")\n",
    "print(standard_scaler.name)\n",
    "print(standard_scaler.version)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "robust_scaler\n",
      "1"
     ]
    }
   ],
   "source": [
    "robust_scaler = fs.get_transformation_function(name=\"robust_scaler\")\n",
    "print(robust_scaler.name)\n",
    "print(robust_scaler.version)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "label_encoder\n",
      "1"
     ]
    }
   ],
   "source": [
    "label_encoder = fs.get_transformation_function(name=\"label_encoder\")\n",
    "print(label_encoder.name)\n",
    "print(label_encoder.version)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# View Built-In Online Transformation Source Code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "# Min-Max scaling\n",
      "def min_max_scaler(value, min_value, max_value):\n",
      "    return (value - min_value) / (max_value - min_value)"
     ]
    }
   ],
   "source": [
    "print(min_max_scaler.transformer_code)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "# Standardization / zcore\n",
      "def standard_scaler(value, mean, std_dev):\n",
      "    return (value - mean) / std_dev"
     ]
    }
   ],
   "source": [
    "print(standard_scaler.transformer_code)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "# Robust scaling\n",
      "def robust_scaler(value, p25, p50, p75):\n",
      "    return (value - p50) / (p75 - p25)"
     ]
    }
   ],
   "source": [
    "print(robust_scaler.transformer_code)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "# label encoder\n",
      "def label_encoder(value, value_to_index):\n",
      "    # define a mapping of values to integers\n",
      "    return value_to_index[value]"
     ]
    }
   ],
   "source": [
    "print(label_encoder.transformer_code)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create Training Dataset With Online Transformation\n",
    "### To use online transformation function for training dataset it must be created from an HSFS `Query` object.\n",
    "\n",
    "Here we assume that the feature group `economy_fg` version 2 is already created. Otherwise please run notebook `time_travel/time_travel_python.ipynb` first.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "economy_fg = fs.get_feature_group('economy_fg',2)\n",
    "demography_fg = fs.get_feature_group('demography_fg',2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+---+---------+----------+-----+--------+------+---------+\n",
      "|year| id|   salary|commission|  car|  hvalue|hyears|     loan|\n",
      "+----+---+---------+----------+-----+--------+------+---------+\n",
      "|2020|  2|160893.77|       0.0|car10|179000.0|     2|455015.34|\n",
      "|2020|  5| 93956.32|       0.0|car15|135000.0|     1| 458679.8|\n",
      "|2020|  6| 41365.43|  52809.15| car7|135000.0|    19| 216839.7|\n",
      "|2020|  7| 94805.61|       0.0|car17|135000.0|    23|233216.06|\n",
      "|2020|  8| 64410.62|  39884.39|car20|125000.0|     6|350707.38|\n",
      "|2020|  9|128298.82|       0.0|car19|135000.0|    12| 20768.06|\n",
      "|2020| 10|100806.92|       0.0| car8|135000.0|     6|293106.66|\n",
      "|2020|  3|119159.65|       0.0| car1|145000.0|    22|122025.08|\n",
      "|2020|  4|  20000.0|  52593.63| car9|185000.0|    30| 99629.62|\n",
      "|2020|  1|120499.73|       0.0|car17|205000.0|    30| 564724.2|\n",
      "+----+---+---------+----------+-----+--------+------+---------+"
     ]
    }
   ],
   "source": [
    "economy_fg.read().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- year: integer (nullable = true)\n",
      " |-- id: integer (nullable = true)\n",
      " |-- salary: float (nullable = true)\n",
      " |-- commission: float (nullable = true)\n",
      " |-- car: string (nullable = true)\n",
      " |-- hvalue: float (nullable = true)\n",
      " |-- hyears: integer (nullable = true)\n",
      " |-- loan: float (nullable = true)"
     ]
    }
   ],
   "source": [
    "economy_fg.read().printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create the training dataset from from an HSFS `Query` object "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = demography_fg.select(['age','elevel','zipcode']).join(economy_fg.select_all())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+------+--------+----+---+---------+----------+-----+--------+------+---------+\n",
      "|age|elevel| zipcode|year| id|   salary|commission|  car|  hvalue|hyears|     loan|\n",
      "+---+------+--------+----+---+---------+----------+-----+--------+------+---------+\n",
      "| 54|level3|zipcode5|2020|  1|120499.73|       0.0|car17|205000.0|    30| 564724.2|\n",
      "| 71|level2|zipcode3|2020|  6| 41365.43|  52809.15| car7|135000.0|    19| 216839.7|\n",
      "| 49|level2|zipcode4|2020|  3|119159.65|       0.0| car1|145000.0|    22|122025.08|\n",
      "| 59|level1|zipcode2|2020|  5| 93956.32|       0.0|car15|135000.0|     1| 458679.8|\n",
      "| 32|level1|zipcode3|2020|  9|128298.82|       0.0|car19|135000.0|    12| 20768.06|\n",
      "+---+------+--------+----+---+---------+----------+-----+--------+------+---------+\n",
      "only showing top 5 rows"
     ]
    }
   ],
   "source": [
    "query.show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Provide transformation functions as dict, where key is feature name and value is the name of the online transformation function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "td = fs.create_training_dataset(name=\"economy_td\",\n",
    "                               description=\"Dataset to train the some model\",\n",
    "                               data_format=\"csv\",\n",
    "                               transformation_functions={\"hyears\":min_max_scaler, \n",
    "                                                         \"loan\":standard_scaler,\n",
    "                                                         \"salary\":robust_scaler,\n",
    "                                                         \"elevel\":label_encoder},\n",
    "                               version=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "td.save(query)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Online tranformation functions are now attached to training dataset as medadata and contain information to which feature groups they will be applied "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "VersionWarning: No version provided for getting training dataset `economy_td`, defaulting to `1`."
     ]
    }
   ],
   "source": [
    "td = fs.get_training_dataset(\"economy_td\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'salary': <hsfs.transformation_function.TransformationFunction object at 0x7f90a8eb30a0>, 'hyears': <hsfs.transformation_function.TransformationFunction object at 0x7f90a8eb32b0>, 'elevel': <hsfs.transformation_function.TransformationFunction object at 0x7f90a8f15490>, 'loan': <hsfs.transformation_function.TransformationFunction object at 0x7f90a8f15f10>}"
     ]
    }
   ],
   "source": [
    "td.transformation_functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+------+--------+----+---+-------------------+----------+-----+--------+--------------------+--------------------+\n",
      "|age|elevel| zipcode|year| id|             salary|commission|  car|  hvalue|              hyears|                loan|\n",
      "+---+------+--------+----+---+-------------------+----------+-----+--------+--------------------+--------------------+\n",
      "| 71|     1|zipcode3|2020|  6|-1.0597689460499202|  52809.15| car7|135000.0|  0.6206896551724138|-0.38649341373235696|\n",
      "| 33|     1|zipcode1|2020|  8|-0.6489014306268116|  39884.39|car20|125000.0|  0.1724137931034483|  0.4140293078456086|\n",
      "| 32|     3|zipcode2|2020|  7|-0.1069960458425144|       0.0|car17|135000.0|  0.7586206896551724| -0.2885635133105802|\n",
      "| 44|     3|zipcode8|2020|  2| 1.0712747733778236|       0.0|car10|179000.0|0.034482758620689655|  1.0377863199492403|\n",
      "| 49|     1|zipcode4|2020|  3|0.32720661898815995|       0.0| car1|145000.0|  0.7241379310344828| -0.9534806526026021|\n",
      "| 32|     3|zipcode3|2020|  9|0.49014685129569574|       0.0|car19|135000.0|  0.3793103448275862|  -1.558993112445764|\n",
      "| 56|     0|zipcode2|2020|  4|-1.4406883689742835|  52593.63| car9|185000.0|                 1.0| -1.0874045125575358|\n",
      "| 54|     2|zipcode5|2020|  1|0.35109856937318834|       0.0|car17|205000.0|                 1.0|  1.6938403242493116|\n",
      "| 58|     1|zipcode5|2020| 10|                0.0|       0.0| car8|135000.0|  0.1724137931034483| 0.06957957257529503|\n",
      "| 59|     3|zipcode2|2020|  5|-0.1221378288216204|       0.0|car15|135000.0|                 0.0|   1.059699680029384|\n",
      "+---+------+--------+----+---+-------------------+----------+-----+--------+--------------------+--------------------+"
     ]
    }
   ],
   "source": [
    "td.read().show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Transformation functions will also be applied to feature vectors retrieved by `get_serving_vector` method"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'id'}"
     ]
    }
   ],
   "source": [
    "td_meta = fs.get_training_dataset(\"economy_td\", 1)\n",
    "#`init_prepared_statement` method is needed to get serving_keys in case `get_serving_vector` has not beed called yet. This is not necessary for `get_serving_vector` method itself\n",
    "td_meta.init_prepared_statement() \n",
    "td_meta.serving_keys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[54, 2, 'zipcode5', 2020, 1, 0.3511034444286508, 0.0, 'car17', 205000.0, 1.0, 1.6938392030076541]"
     ]
    }
   ],
   "source": [
    "td_meta.get_serving_vector({'id': 1})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training dataset With Splits\n",
    "\n",
    "When creating a training dataset with multiple random splits, the user needs to tell HSFS which split is going to be used for training by supplying the `train_split` argument. The statistics such as min/max/std/mean of this split will be used for the online transformations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "td_with_splits = fs.create_training_dataset(name=\"economy_td\",\n",
    "                                            description=\"Dataset to train the some model\",\n",
    "                                            data_format=\"csv\",\n",
    "                                            transformation_functions={\"hyears\":min_max_scaler, \n",
    "                                                                      \"loan\":standard_scaler,\n",
    "                                                                      \"salary\":robust_scaler},\n",
    "                                            splits={'train': 0.7, 'test': 0.2, 'validate': 0.1},\n",
    "                                            train_split = 'train',            \n",
    "                                            version=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'train': 0.7, 'test': 0.2, 'validate': 0.1}"
     ]
    }
   ],
   "source": [
    "td_with_splits.splits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "td_with_splits.save(query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "td_meta = fs.get_training_dataset(\"economy_td\", 2)\n",
    "td_meta.serving_keys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[44, 'level1', 'zipcode8', 2020, 2, 1.207115337594111, 0.0, 'car10', 179000.0, 0.034482758620689655, 1.2437140275646557]"
     ]
    }
   ],
   "source": [
    "td_meta.get_serving_vector({'id': 2})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "PySpark",
   "language": "python",
   "name": "pysparkkernel"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "python",
    "version": 3
   },
   "mimetype": "text/x-python",
   "name": "pyspark",
   "pygments_lexer": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}