"""
Linear Regression with Tensorflow following this tutorial:
https://www.tensorflow.org/tutorials/keras/regression
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Make printed numpy arrays easier to read
np.set_printoptions(precision=3, suppress=True)


def load_data():
    """Download the Auto MPG dataset and return it as a DataFrame.

    Values given as '?' in the file are read as NaN.
    """
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
    column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                    'Acceleration', 'Model Year', 'Origin']

    raw_dataset = pd.read_csv(url, names=column_names,
                              na_values='?', comment='\t',
                              sep=' ', skipinitialspace=True)
    return raw_dataset


def plot_loss(history):
    """Plot training and validation loss per epoch from a Keras fit history."""
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.ylim([0, 10])
    plt.xlabel('Epoch')
    plt.ylabel('Error [MPG]')
    plt.legend()
    plt.grid(True)
    plt.show()


def plot_horsepower(x, y, train_features, train_labels):
    """Scatter the training data (Horsepower vs. MPG) and overlay predictions.

    x, y: horsepower values and the corresponding model predictions.
    train_features: DataFrame containing a 'Horsepower' column.
    train_labels: MPG values matching the rows of train_features.
    """
    plt.scatter(train_features['Horsepower'], train_labels, label='Data')
    plt.plot(x, y, color='k', label='Predictions')
    plt.xlabel('Horsepower')
    plt.ylabel('MPG')
    plt.legend()
    plt.show()


def _prepare_data(raw_dataset):
    """Clean, one-hot encode and split the dataset; print/plot an overview.

    Returns (train_features, test_features, train_labels, test_labels),
    where the labels are the 'MPG' column removed from the features.
    """
    dataset = raw_dataset.copy()

    # Print the end of the dataset
    print(dataset.tail())

    # Check for any undefined values
    print(dataset.isna().sum())

    # Drop rows containing unknown values
    dataset = dataset.dropna()

    # 'Origin' is categorical: map the numeric codes to names and one-hot
    # encode them. dtype=float keeps every column numeric; without it recent
    # pandas versions emit bool columns, which turns np.array(features) into
    # an object array that Keras cannot convert to a tensor.
    dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
    dataset = pd.get_dummies(dataset, columns=['Origin'],
                             prefix='', prefix_sep='', dtype=float)

    # Split into training and test data
    train_dataset = dataset.sample(frac=0.8, random_state=0)
    test_dataset = dataset.drop(train_dataset.index)

    # Show pair plots of the data to see if there are any probable correlations
    sns.pairplot(train_dataset[['MPG', 'Cylinders', 'Displacement', 'Weight']],
                 diag_kind='kde')
    plt.show()

    # Print statistics about the features
    print(train_dataset.describe().transpose())

    # Separate the label from the features
    train_features = train_dataset.copy()
    test_features = test_dataset.copy()
    train_labels = train_features.pop('MPG')
    test_labels = test_features.pop('MPG')

    return train_features, test_features, train_labels, test_labels


def _build_feature_normalizer(train_features):
    """Create a Normalization layer adapted to all features and demo its effect."""
    # Create the normalization layer and fit it to the training data
    normalizer = tf.keras.layers.Normalization(axis=-1)
    normalizer.adapt(np.array(train_features))

    # See what effect the normalization has on the first example
    first = np.array(train_features[:1])
    with np.printoptions(precision=2, suppress=True):
        print('First example:', first)
        print()
        print('Normalized:', normalizer(first).numpy())

    return normalizer


def _train_and_report(model, features, labels):
    """Compile and fit the model, then print and plot the training progress."""
    # Configure the training process
    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=0.1),
        loss='mean_absolute_error'
    )

    # Train for 100 epochs
    history = model.fit(
        features,
        labels,
        epochs=100,
        # Suppress logging.
        verbose=0,
        # Calculate validation results on 20% of the training data.
        validation_split=0.2
    )

    # Show training progress
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    print(hist.tail())

    # Show loss plot
    plot_loss(history)

    return history


def regression_single_input():
    """Fit a linear model predicting MPG from Horsepower only.

    Returns a dict mapping 'horsepower_model' to the value returned by
    evaluating the trained model on the test set.
    """
    #################################
    ### Prepare the training data ###
    #################################
    train_features, test_features, train_labels, test_labels = _prepare_data(load_data())

    ##############################
    ### Normalize the features ###
    ##############################
    # Only demonstrates the effect of normalization on all features;
    # the single-input model below uses its own Horsepower normalizer.
    _build_feature_normalizer(train_features)

    #######################################
    ### Start with the regression stuff ###
    #######################################
    # Create the Horsepower numpy array and a normalizer fitted to it
    horsepower = np.array(train_features['Horsepower'])
    horsepower_normalizer = layers.Normalization(input_shape=[1, ], axis=None)
    horsepower_normalizer.adapt(horsepower)

    # Build the Keras Sequential model:
    # a linear transformation producing 1 output (layers.Dense)
    horsepower_model = tf.keras.Sequential([
        horsepower_normalizer,
        layers.Dense(units=1)
    ])
    # summary() prints by itself and returns None, so do not wrap it in print()
    horsepower_model.summary()

    # Run the model without training
    print(horsepower_model.predict(horsepower[:10]))

    _train_and_report(horsepower_model, train_features['Horsepower'], train_labels)

    # Collect results on test set
    test_results = {}
    test_results['horsepower_model'] = horsepower_model.evaluate(
        test_features['Horsepower'],
        test_labels,
        verbose=0
    )

    # Plot the regression line
    x = tf.linspace(0.0, 250, 251)
    y = horsepower_model.predict(x)
    plot_horsepower(x, y, train_features, train_labels)

    return test_results


def regression_multiple_inputs():
    """Fit a linear model predicting MPG from all features.

    Returns a dict mapping 'linear_model' to the value returned by
    evaluating the trained model on the test set.
    """
    #################################
    ### Prepare the training data ###
    #################################
    train_features, test_features, train_labels, test_labels = _prepare_data(load_data())

    ##############################
    ### Normalize the features ###
    ##############################
    normalizer = _build_feature_normalizer(train_features)

    #######################################
    ### Start with the regression stuff ###
    #######################################
    # Build the Keras Sequential model:
    # a linear transformation producing 1 output (layers.Dense)
    linear_model = tf.keras.Sequential([
        normalizer,
        layers.Dense(units=1)
    ])

    # Run the model without training
    print(linear_model.predict(train_features[:10]))

    _train_and_report(linear_model, train_features, train_labels)

    # Collect results on test set
    test_results = {}
    test_results['linear_model'] = linear_model.evaluate(
        test_features,
        test_labels,
        verbose=0
    )

    # No regression-line plot here: with several input features the fit is
    # not a single line over Horsepower, so plot_horsepower does not apply.

    return test_results


if __name__ == '__main__':
    regression_single_input()
    regression_multiple_inputs()