"""
Linear Regression with Tensorflow following this tutorial: https://www.tensorflow.org/tutorials/keras/regression
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Print NumPy arrays with 3 decimal places and without scientific notation
np.set_printoptions(precision=3, suppress=True)


def load_data(source='http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'):
	"""
	Load the Auto MPG dataset into a DataFrame.

	:param source: Anything ``pandas.read_csv`` accepts (URL, local path or file-like object).
		Defaults to the copy hosted in the UCI machine learning repository, so calling
		``load_data()`` without arguments behaves as before.
	:return: DataFrame with the columns MPG, Cylinders, Displacement, Horsepower, Weight,
		Acceleration, Model Year and Origin. Fields given as '?' are read as NaN.
	"""
	column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
					'Acceleration', 'Model Year', 'Origin']

	# The file is space separated (with runs of spaces used for alignment). Everything from the
	# first tab onwards is treated as a comment and dropped, so only the 8 columns above remain.
	return pd.read_csv(
		source,
		names=column_names,
		na_values='?',
		comment='\t',
		sep=' ',
		skipinitialspace=True,
	)


def plot_loss(history):
	"""
	Show the training and validation loss curves stored in ``history.history``.

	:param history: Object whose ``history`` attribute maps 'loss' and 'val_loss' to
		per-epoch values (as returned by ``fit`` in this file).
	"""
	# Training curve first, then validation; each curve is labelled with its own key
	for curve in ('loss', 'val_loss'):
		plt.plot(history.history[curve], label=curve)

	# Axis decoration; the y axis is clipped to the range 0..10
	plt.xlabel('Epoch')
	plt.ylabel('Error [MPG]')
	plt.ylim([0, 10])
	plt.grid(True)
	plt.legend()

	plt.show()


def plot_horsepower(x, y, train_features, train_labels):
	"""
	Scatter the training labels against horsepower and draw the line given by ``x``/``y`` on top.

	:param x: x coordinates of the line to draw
	:param y: y coordinates of the line to draw
	:param train_features: Training features; only the 'Horsepower' column is used
	:param train_labels: Labels belonging to ``train_features``
	"""
	horsepower_values = train_features['Horsepower']

	# Raw data as points, the line in black on top
	plt.scatter(horsepower_values, train_labels, label='Data')
	plt.plot(x, y, label='Predictions', color='k')

	plt.ylabel('MPG')
	plt.xlabel('Horsepower')
	plt.legend()

	plt.show()

def _prepare_data():
	"""
	Load the Auto MPG data and turn it into training/test features and labels.

	Along the way this prints the tail of the raw data, the NaN count per column and the
	summary statistics of the training split, and shows a pair plot of a few columns.

	:return: Tuple ``(train_features, test_features, train_labels, test_labels)`` where the
		labels are the 'MPG' column and the features are all remaining columns.
	"""
	dataset = load_data()

	# Print the end of the dataset
	print(dataset.tail())

	# Check for any undefined values
	print(dataset.isna().sum())

	# Drop the rows that contain undefined values
	dataset = dataset.dropna()

	# 'Origin' is a numeric category code, not a quantity: map it to a name and one-hot encode it.
	# Can be done via a keras layer, but that is overkill for 3 values.
	# dtype=float keeps every column numeric. pandas >= 2.0 produces bool columns by default, which
	# turns np.array(features) into an object array that the Normalization layer cannot work with.
	dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
	dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='', dtype=float)

	# Split into training and test data (fixed seed, so the split is reproducible)
	train_dataset = dataset.sample(frac=0.8, random_state=0)
	test_dataset = dataset.drop(train_dataset.index)

	# Show pair plots of the data to see if there are any probable correlations
	sns.pairplot(train_dataset[['MPG', 'Cylinders', 'Displacement', 'Weight']], diag_kind='kde')
	plt.show()

	# Print statistics about the features
	print(train_dataset.describe().transpose())

	# Separate the label from the features
	train_features = train_dataset.copy()
	test_features = test_dataset.copy()

	train_labels = train_features.pop('MPG')
	test_labels = test_features.pop('MPG')

	return train_features, test_features, train_labels, test_labels


def _build_feature_normalizer(train_features):
	"""
	Create a Normalization layer adapted to all training features.

	Also prints the first training example before and after normalization.

	:param train_features: Training features (all columns)
	:return: The adapted normalization layer
	"""
	normalizer = layers.Normalization(axis=-1)

	# Fit the preprocessing layer to the data
	normalizer.adapt(np.array(train_features))

	# See what effect the normalization has
	first = np.array(train_features[:1])

	with np.printoptions(precision=2, suppress=True):
		print('First example:', first)
		print()
		print('Normalized:', normalizer(first).numpy())

	return normalizer


def _train_and_evaluate(model, name, train_inputs, train_labels, test_inputs, test_labels):
	"""
	Compile, train and evaluate ``model``; print and plot the training progress.

	:param model: Keras model to train
	:param name: Key under which the test result is reported
	:param train_inputs: Model inputs used for training
	:param train_labels: Labels belonging to ``train_inputs``
	:param test_inputs: Model inputs used for the final evaluation
	:param test_labels: Labels belonging to ``test_inputs``
	:return: Dict mapping ``name`` to the value returned by ``model.evaluate``
	"""
	# Configure the training process
	model.compile(
		optimizer=tf.optimizers.Adam(learning_rate=0.1),
		loss='mean_absolute_error'
	)

	# Train for 100 epochs
	history = model.fit(
		train_inputs,
		train_labels,
		epochs=100,
		# Suppress logging.
		verbose=0,
		# Calculate validation results on 20% of the training data.
		validation_split=0.2
	)

	# Show training progress
	hist = pd.DataFrame(history.history)
	hist['epoch'] = history.epoch
	print(hist.tail())

	# Show loss plot
	plot_loss(history)

	# Collect the result on the test set and report it instead of silently discarding it
	test_results = {name: model.evaluate(test_inputs, test_labels, verbose=0)}
	print(test_results)

	return test_results


def regression_single_input():
	"""Train and evaluate a linear model that predicts MPG from the horsepower alone."""
	train_features, test_features, train_labels, test_labels = _prepare_data()

	# Not used by the model below; built only to print what normalization does to the features
	_build_feature_normalizer(train_features)

	# Create Horsepower NP array and a normalization layer for this single feature
	horsepower = np.array(train_features['Horsepower'])

	horsepower_normalizer = layers.Normalization(input_shape=[1, ], axis=None)
	horsepower_normalizer.adapt(horsepower)

	# Build the Keras Sequential Model
	# Apply a linear transformation to produce 1 output using a linear layer (layers.Dense)
	horsepower_model = tf.keras.Sequential([
		horsepower_normalizer,
		layers.Dense(units=1)
	])

	# summary() prints the table itself and returns None, so it must not be wrapped in print()
	horsepower_model.summary()

	# Run the model without training
	print(horsepower_model.predict(horsepower[:10]))

	_train_and_evaluate(
		horsepower_model,
		'horsepower_model',
		train_features['Horsepower'],
		train_labels,
		test_features['Horsepower'],
		test_labels
	)

	# Plot the regression line
	x = tf.linspace(0.0, 250, 251)
	y = horsepower_model.predict(x)
	plot_horsepower(x, y, train_features, train_labels)


def regression_multiple_inputs():
	"""Train and evaluate a linear model that predicts MPG from all features."""
	train_features, test_features, train_labels, test_labels = _prepare_data()

	# Normalization layer over all feature columns; it is the first layer of the model
	normalizer = _build_feature_normalizer(train_features)

	# Build the Keras Sequential Model
	# Apply a linear transformation to produce 1 output using a linear layer (layers.Dense)
	linear_model = tf.keras.Sequential([
		normalizer,
		layers.Dense(units=1)
	])

	# Run the model without training
	print(linear_model.predict(train_features[:10]))

	_train_and_evaluate(
		linear_model,
		'linear_model',
		train_features,
		train_labels,
		test_features,
		test_labels
	)

	# No regression-line plot here: this model expects all feature columns as input, so it
	# cannot be evaluated on a horsepower-only axis like the single-input model.


if __name__ == '__main__':
	# Run both experiments back to back: the single-feature model first, then the all-features model
	for run_experiment in (regression_single_input, regression_multiple_inputs):
		run_experiment()