# FunWithAI/regression_example.py

"""
Linear regression with TensorFlow, following this tutorial: https://www.tensorflow.org/tutorials/keras/regression
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# Make numpy output easier to read
np.set_printoptions(precision=3, suppress=True)


def load_data():
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
    column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                    'Acceleration', 'Model Year', 'Origin']
    raw_dataset = pd.read_csv(url, names=column_names,
                              na_values='?', comment='\t',
                              sep=' ', skipinitialspace=True)
    return raw_dataset
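

# Optional variant (a sketch, not in the original): cache the dataset locally
# with keras.utils.get_file so repeated runs don't re-download it. The cache
# filename 'auto-mpg.data' is an assumption.
def load_data_cached():
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
    path = keras.utils.get_file('auto-mpg.data', url)
    column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                    'Acceleration', 'Model Year', 'Origin']
    return pd.read_csv(path, names=column_names,
                       na_values='?', comment='\t',
                       sep=' ', skipinitialspace=True)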


def plot_loss(history):
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.ylim([0, 10])
    plt.xlabel('Epoch')
    plt.ylabel('Error [MPG]')
    plt.legend()
    plt.grid(True)
    plt.show()


def plot_horsepower(x, y, train_features, train_labels):
    plt.scatter(train_features['Horsepower'], train_labels, label='Data')
    plt.plot(x, y, color='k', label='Predictions')
    plt.xlabel('Horsepower')
    plt.ylabel('MPG')
    plt.legend()
    plt.show()


def regression_single_input():
    raw_dataset = load_data()
    dataset = raw_dataset.copy()
    #################################
    ### Prepare the training data ###
    #################################
    # Print the end of the dataset
    print(dataset.tail())
    # Check for any undefined values
    print(dataset.isna().sum())
    # Drop rows with unknown values
    dataset = dataset.dropna()
    # Map the numeric 'Origin' codes to category names and one-hot encode them.
    # This could be done via a Keras layer, but that is overkill for 3 values.
    dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
    dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='')
    # Split into training and test data
    train_dataset = dataset.sample(frac=0.8, random_state=0)
    test_dataset = dataset.drop(train_dataset.index)
    # Plot pair plots of the data to see if there are any probable correlations
    sns.pairplot(train_dataset[['MPG', 'Cylinders', 'Displacement', 'Weight']], diag_kind='kde')
    plt.show()
    # Print statistics about the features
    print(train_dataset.describe().transpose())
    # Separate the label from the features
    train_features = train_dataset.copy()
    test_features = test_dataset.copy()
    train_labels = train_features.pop('MPG')
    test_labels = test_features.pop('MPG')
    ##############################
    ### Normalize the features ###
    ##############################
    # Create the normalization layer
    normalizer = tf.keras.layers.Normalization(axis=-1)
    # Fit the preprocessing layer to the data
    normalizer.adapt(np.array(train_features))
    # See what effect the normalization has
    first = np.array(train_features[:1])
    with np.printoptions(precision=2, suppress=True):
        print('First example:', first)
        print()
        print('Normalized:', normalizer(first).numpy())
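
    # Sanity check (a minimal sketch, not from the tutorial): the adapted
    # Normalization layer should match a manual (x - mean) / std computation
    # over the training features; the tolerance below is an arbitrary choice.
    features = np.array(train_features, dtype=np.float32)
    manual = (first.astype(np.float32) - features.mean(axis=0)) / features.std(axis=0)
    assert np.allclose(manual, normalizer(first).numpy(), atol=1e-2)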
    #######################################
    ### Start with the regression stuff ###
    #######################################
    # Create a Horsepower NumPy array and normalize it
    horsepower = np.array(train_features['Horsepower'])
    horsepower_normalizer = layers.Normalization(input_shape=[1, ], axis=None)
    horsepower_normalizer.adapt(horsepower)
    # Build the Keras Sequential model:
    # apply a linear transformation to produce 1 output using a linear layer (layers.Dense)
    horsepower_model = tf.keras.Sequential([
        horsepower_normalizer,
        layers.Dense(units=1)
    ])
    # Print the model summary (summary() prints itself and returns None,
    # so wrapping it in print() would just print 'None')
    horsepower_model.summary()
    # Run the model without training
    print(horsepower_model.predict(horsepower[:10]))
    # Configure the training process
    horsepower_model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=0.1),
        loss='mean_absolute_error'
    )
    # Train for 100 epochs
    history = horsepower_model.fit(
        train_features['Horsepower'],
        train_labels,
        epochs=100,
        # Suppress logging.
        verbose=0,
        # Calculate validation results on 20% of the training data.
        validation_split=0.2
    )
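
    # A minimal sketch (not in the original): instead of a fixed 100 epochs,
    # training could stop once the validation loss stops improving, via the
    # standard EarlyStopping callback; patience=10 is an arbitrary choice.
    # history = horsepower_model.fit(
    #     train_features['Horsepower'], train_labels,
    #     epochs=100, verbose=0, validation_split=0.2,
    #     callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)]
    # )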
    # Show training progress
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    print(hist.tail())
    # Show loss plot
    plot_loss(history)
    # Collect results on the test set
    test_results = {}
    test_results['horsepower_model'] = horsepower_model.evaluate(
        test_features['Horsepower'],
        test_labels, verbose=0
    )
    # Plot the regression line
    x = tf.linspace(0.0, 250, 251)
    y = horsepower_model.predict(x)
    plot_horsepower(x, y, train_features, train_labels)
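
    # A minimal sketch (not in the original): with a single Dense unit on a
    # normalized input, the learned parameters are just a slope and an
    # intercept in normalized-horsepower space.
    kernel, bias = horsepower_model.layers[1].get_weights()
    print(f'Learned line: MPG = {kernel[0][0]:.3f} * hp_norm + {bias[0]:.3f}')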


def regression_multiple_inputs():
    raw_dataset = load_data()
    dataset = raw_dataset.copy()
    #################################
    ### Prepare the training data ###
    #################################
    # Print the end of the dataset
    print(dataset.tail())
    # Check for any undefined values
    print(dataset.isna().sum())
    # Drop rows with unknown values
    dataset = dataset.dropna()
    # Map the numeric 'Origin' codes to category names and one-hot encode them.
    # This could be done via a Keras layer, but that is overkill for 3 values.
    dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
    dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='')
    # Split into training and test data
    train_dataset = dataset.sample(frac=0.8, random_state=0)
    test_dataset = dataset.drop(train_dataset.index)
    # Plot pair plots of the data to see if there are any probable correlations
    sns.pairplot(train_dataset[['MPG', 'Cylinders', 'Displacement', 'Weight']], diag_kind='kde')
    plt.show()
    # Print statistics about the features
    print(train_dataset.describe().transpose())
    # Separate the label from the features
    train_features = train_dataset.copy()
    test_features = test_dataset.copy()
    train_labels = train_features.pop('MPG')
    test_labels = test_features.pop('MPG')
    ##############################
    ### Normalize the features ###
    ##############################
    # Create the normalization layer
    normalizer = tf.keras.layers.Normalization(axis=-1)
    # Fit the preprocessing layer to the data
    normalizer.adapt(np.array(train_features))
    # See what effect the normalization has
    first = np.array(train_features[:1])
    with np.printoptions(precision=2, suppress=True):
        print('First example:', first)
        print()
        print('Normalized:', normalizer(first).numpy())
    #######################################
    ### Start with the regression stuff ###
    #######################################
    # Build the Keras Sequential model on all features:
    # apply a linear transformation to produce 1 output using a linear layer (layers.Dense)
    linear_model = tf.keras.Sequential([
        normalizer,
        layers.Dense(units=1)
    ])
    # Run the model without training
    print(linear_model.predict(train_features[:10]))
    # Configure the training process
    linear_model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=0.1),
        loss='mean_absolute_error'
    )
    # Train for 100 epochs
    history = linear_model.fit(
        train_features,
        train_labels,
        epochs=100,
        # Suppress logging.
        verbose=0,
        # Calculate validation results on 20% of the training data.
        validation_split=0.2
    )
    # Show training progress
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    print(hist.tail())
    # Show loss plot
    plot_loss(history)
    # Collect results on the test set
    test_results = {}
    test_results['linear_model'] = linear_model.evaluate(
        test_features, test_labels, verbose=0
    )
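
    # A minimal sketch (not in the original): with one Dense unit over the
    # normalized features, each kernel entry is the weight of one input column.
    kernel, bias = linear_model.layers[1].get_weights()
    for name, weight in zip(train_features.columns, kernel[:, 0]):
        print(f'{name}: {weight:.3f}')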
    # Plotting a single regression line doesn't work for multiple inputs,
    # since the fitted function lives in a 9-dimensional feature space.
    # x = tf.linspace(0.0, 250, 251)
    # y = linear_model.predict(x)
    # plot_horsepower(x, y, train_features, train_labels)
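
    # Following the tutorial's approach for multiple inputs: plot predicted
    # vs. true MPG on the test set instead of a single regression line.
    test_predictions = linear_model.predict(test_features).flatten()
    plt.scatter(test_labels, test_predictions)
    plt.xlabel('True Values [MPG]')
    plt.ylabel('Predictions [MPG]')
    plt.plot([0, 50], [0, 50], color='k')  # reference line: prediction == truth
    plt.show()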


if __name__ == '__main__':
    regression_single_input()
    regression_multiple_inputs()