---
title: "Feature group with complex feature"
date: 2022-03-14
type: technical_note
draft: false
---

# Create feature groups with complex feature types, such as arrays.

In [16]:
# pandas and numpy
import numpy as np
from numpy.random import default_rng
import pandas as pd

# pyspark functions
from pyspark.sql import functions as F
from pyspark.sql.functions import array, coalesce, concat,  col

## Generate dataframe with array type featuers  

In [19]:
seed = 42
np_data = []
for i in range(1000):
    np_data.append([i,default_rng(42).random((100)).tolist()])
df_with_complex_ft=sc.parallelize(np_data).toDF(['id','array_ft'])    
df_with_complex_ft.show()

+---+--------------------+
| id|            array_ft|
+---+--------------------+
|  0|[0.77395604855596...|
|  1|[0.77395604855596...|
|  2|[0.77395604855596...|
|  3|[0.77395604855596...|
|  4|[0.77395604855596...|
|  5|[0.77395604855596...|
|  6|[0.77395604855596...|
|  7|[0.77395604855596...|
|  8|[0.77395604855596...|
|  9|[0.77395604855596...|
| 10|[0.77395604855596...|
| 11|[0.77395604855596...|
| 12|[0.77395604855596...|
| 13|[0.77395604855596...|
| 14|[0.77395604855596...|
| 15|[0.77395604855596...|
| 16|[0.77395604855596...|
| 17|[0.77395604855596...|
| 18|[0.77395604855596...|
| 19|[0.77395604855596...|
+---+--------------------+
only showing top 20 rows

## Create a connection to hsfs

In [20]:
import hsfs
from hops import hdfs
# Create a connection
connection = hsfs.connection()
# Get the feature store handle for the project's feature store
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

## Create feature group with complex type of features

In [22]:
extra_hudi_options = {
    "hoodie.bulkinsert.shuffle.parallelism":"1", 
    "hoodie.insert.shuffle.parallelism":"1", 
    "hoodie.upsert.shuffle.parallelism":"1",
    "hoodie.parquet.compression.ratio":"0.5"
}

fg_with_complex_ft = fs.create_feature_group(name="fg_with_complex_ft",
                                             version=1,
                                             primary_key=["id"],
                                             description="feature group with complex type of features",
                                             time_travel_format="HUDI",     
                                             online_enabled=True,                                                
                                             statistics_config=False
                                            )

fg_with_complex_ft.save(df_with_complex_ft, extra_hudi_options)

## Create training dataset from feature group with complex type of features

In [28]:
fg_with_complex_ft = fs.get_feature_group("fg_with_complex_ft",1)
query = fg_with_complex_ft.select_all()
td = fs.create_training_dataset(name="td_with_complex_ft",
                                       version=1,
                                       data_format="tfrecord",
                                       statistics_config=False, 
                                       coalesce=True,
                                       description="training dataset with complex type of features")
td.save(query)

## Retrieve feature vector from online feature store

In [30]:
td = fs.get_training_dataset("td_with_complex_ft",1)

In [None]:
td.get_serving_vector({"id":1})