Initial commit

2021-12-24 20:23:59 +01:00
commit 71a065e52c
4 changed files with 461 additions and 0 deletions
@@ -0,0 +1,157 @@
 ### IntelliJ files
 .idea/
 ### VirtualEnv template
 # Virtualenv
 # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
 .Python
 [Bb]in
 [Ii]nclude
 [Ll]ib
 [Ll]ib64
 [Ll]ocal
 [Ss]cripts
 pyvenv.cfg
 .venv
 pip-selfcheck.json
 ### Python template
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 *.py,cover
 .hypothesis/
 .pytest_cache/
 cover/
 # Translations
 *.mo
 *.pot
 # Django stuff:
 *.log
 local_settings.py
 db.sqlite3
 db.sqlite3-journal
 # Flask stuff:
 instance/
 .webassets-cache
 # Scrapy stuff:
 .scrapy
 # Sphinx documentation
 docs/_build/
 # PyBuilder
 .pybuilder/
 target/
 # Jupyter Notebook
 .ipynb_checkpoints
 # IPython
 profile_default/
 ipython_config.py
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
 # .python-version
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 #   install all needed dependencies.
 #Pipfile.lock
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 __pypackages__/
 # Celery stuff
 celerybeat-schedule
 celerybeat.pid
 # SageMath parsed files
 *.sage.py
 # Environments
 .env
 .venv
 env/
 venv/
 ENV/
 env.bak/
 venv.bak/
 # Spyder project settings
 .spyderproject
 .spyproject
 # Rope project settings
 .ropeproject
 # mkdocs documentation
 /site
 # mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json
 # Pyre type checker
 .pyre/
 # pytype static type analyzer
 .pytype/
 # Cython debug symbols
 cython_debug/
@@ -0,0 +1,16 @@
 # This is a sample Python script.
 # Press Shift+F10 to execute it or replace it with your code.
 # Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
 def print_hi(name):
    """Print the greeting ``Hi, <name>`` on standard output."""
    greeting = 'Hi, {}'.format(name)
    print(greeting)
 # Press the green button in the gutter to run the script.
 # Entry point: greet only when this file is executed directly, not when it is imported.
 if __name__ == '__main__':
    print_hi('PyCharm')
 # See PyCharm help at https://www.jetbrains.com/help/pycharm/
@@ -0,0 +1,282 @@
 """
 Linear Regression with Tensorflow following this tutorial: https://www.tensorflow.org/tutorials/keras/regression
 """
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import seaborn as sns
 import tensorflow as tf
 from tensorflow import keras
 from tensorflow.keras import layers
 # Make NumPy output easier to read: 3 decimal places, no scientific notation
 np.set_printoptions(precision=3, suppress=True)
 def load_data(url=None):
 	"""
 	Load the Auto MPG dataset into a pandas DataFrame.

 	Args:
 		url: Location of the raw data file. Anything ``pd.read_csv`` accepts
 			works (URL, local path, file-like object). When omitted, the file
 			is fetched from the UCI Machine Learning Repository.

 	Returns:
 		A DataFrame with the columns MPG, Cylinders, Displacement, Horsepower,
 		Weight, Acceleration, Model Year and Origin. Fields given as ``?`` in
 		the file are returned as NaN.
 	"""
 	if url is None:
 		url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
 	column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
 					'Acceleration', 'Model Year', 'Origin']
 	# The file is space-aligned, so runs of spaces count as a single separator.
 	# Treating the tab as a comment marker makes the parser ignore everything
 	# after the first tab on each line (in the UCI file: the quoted car name).
 	return pd.read_csv(url, names=column_names,
 					   na_values='?', comment='\t',
 					   sep=' ', skipinitialspace=True)
 def plot_loss(history):
 	"""
 	Plot training and validation loss per epoch and show the figure.

 	Args:
 		history: Object whose ``history`` mapping holds the ``'loss'`` and
 			``'val_loss'`` series (the callers pass the result of ``model.fit``).
 	"""
 	# Both curves share one axis; the dictionary key doubles as the legend label.
 	for series in ('loss', 'val_loss'):
 		plt.plot(history.history[series], label=series)
 	plt.xlabel('Epoch')
 	plt.ylabel('Error [MPG]')
 	# Fixed y-range so different runs can be compared visually.
 	plt.ylim([0, 10])
 	plt.grid(True)
 	plt.legend()
 	plt.show()
 def plot_horsepower(x, y, train_features, train_labels):
 	"""
 	Show the training data as a scatter plot with the predicted curve on top.

 	Args:
 		x: Horsepower values at which the model was evaluated.
 		y: Model predictions for ``x``.
 		train_features: Training features; only the ``'Horsepower'`` column is read.
 		train_labels: Training labels, plotted against the horsepower column.
 	"""
 	observed_horsepower = train_features['Horsepower']
 	plt.scatter(observed_horsepower, train_labels, label='Data')
 	plt.plot(x, y, label='Predictions', color='k')
 	plt.ylabel('MPG')
 	plt.xlabel('Horsepower')
 	plt.legend()
 	plt.show()
 def regression_single_input():
 	"""
 	Fit a linear model that predicts MPG from the single feature Horsepower.

 	Loads the data with ``load_data()``, drops incomplete rows, one-hot encodes
 	``Origin``, splits 80/20 into training and test data (fixed seed), trains a
 	normalization + ``Dense(1)`` model for 100 epochs, and shows the pair plot,
 	the loss curves and the fitted regression line. Each ``plt.show()`` call
 	opens a figure window.

 	Returns:
 		dict: ``{'horsepower_model': <loss on the test set>}``; the loss is the
 		mean absolute error the model was compiled with.
 	"""
 	dataset = load_data()

 	#################################
 	### Prepare the training data ###
 	#################################
 	# Print the end of the dataset
 	print(dataset.tail())
 	# Count the undefined values per column
 	print(dataset.isna().sum())
 	# Drop rows that contain undefined values
 	dataset = dataset.dropna()
 	# 'Origin' is a categorical code, not a quantity: map the codes to country names
 	# and one-hot encode them. Could be done inside the Keras model, but that is
 	# overkill for 3 values.
 	dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
 	# dtype=float is required: pandas >= 2.0 creates bool dummy columns by default,
 	# which turns np.array(train_features) into an object array that the
 	# Normalization layer cannot adapt to.
 	dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='', dtype=float)
 	# Split into training and test data (fixed seed for reproducibility)
 	train_dataset = dataset.sample(frac=0.8, random_state=0)
 	test_dataset = dataset.drop(train_dataset.index)
 	# Show pair plots of the data to see if there are any probable correlations
 	sns.pairplot(train_dataset[['MPG', 'Cylinders', 'Displacement', 'Weight']], diag_kind='kde')
 	plt.show()
 	# Print statistics about the features
 	print(train_dataset.describe().transpose())
 	# Separate the label from the features
 	train_features = train_dataset.copy()
 	test_features = test_dataset.copy()
 	train_labels = train_features.pop('MPG')
 	test_labels = test_features.pop('MPG')

 	##############################
 	### Normalize the features ###
 	##############################
 	# This all-feature normalizer is only used to demonstrate the effect of
 	# normalization; the model below has its own single-feature normalizer.
 	normalizer = tf.keras.layers.Normalization(axis=-1)
 	# Fit the preprocessing layer to the data
 	normalizer.adapt(np.array(train_features))
 	# See what effect the normalization has
 	first = np.array(train_features[:1])
 	with np.printoptions(precision=2, suppress=True):
 		print('First example:', first)
 		print()
 		print('Normalized:', normalizer(first).numpy())

 	#######################################
 	### Start with the regression stuff ###
 	#######################################
 	# Create the Horsepower NumPy array and a normalization layer adapted to it
 	horsepower = np.array(train_features['Horsepower'])
 	horsepower_normalizer = layers.Normalization(input_shape=[1, ], axis=None)
 	horsepower_normalizer.adapt(horsepower)
 	# Build the Keras Sequential model:
 	# normalize the input, then apply a linear transformation with one output (layers.Dense)
 	horsepower_model = tf.keras.Sequential([
 		horsepower_normalizer,
 		layers.Dense(units=1)
 	])
 	# summary() prints the table itself and returns None, so it must not be wrapped in print()
 	horsepower_model.summary()
 	# Run the untrained model once to check that the shapes fit
 	print(horsepower_model.predict(horsepower[:10]))
 	# Configure the training process
 	horsepower_model.compile(
 		optimizer=tf.optimizers.Adam(learning_rate=0.1),
 		loss='mean_absolute_error'
 	)
 	# Train for 100 epochs
 	history = horsepower_model.fit(
 		train_features['Horsepower'],
 		train_labels,
 		epochs=100,
 		# Suppress logging.
 		verbose=0,
 		# Calculate validation results on 20% of the training data.
 		validation_split=0.2
 	)
 	# Show training progress
 	hist = pd.DataFrame(history.history)
 	hist['epoch'] = history.epoch
 	print(hist.tail())
 	# Show loss plot
 	plot_loss(history)
 	# Collect results on the test set
 	test_results = {
 		'horsepower_model': horsepower_model.evaluate(
 			test_features['Horsepower'],
 			test_labels, verbose=0
 		)
 	}
 	# Plot the regression line over the horsepower range 0..250
 	x = tf.linspace(0.0, 250, 251)
 	y = horsepower_model.predict(x)
 	plot_horsepower(x, y, train_features, train_labels)
 	return test_results
 def regression_multiple_inputs():
 	"""
 	Fit a linear model that predicts MPG from all available features.

 	Loads the data with ``load_data()``, drops incomplete rows, one-hot encodes
 	``Origin``, splits 80/20 into training and test data (fixed seed), trains a
 	normalization + ``Dense(1)`` model on every feature column for 100 epochs,
 	and shows the pair plot and the loss curves. Each ``plt.show()`` call opens
 	a figure window.

 	Returns:
 		dict: ``{'linear_model': <loss on the test set>}``; the loss is the
 		mean absolute error the model was compiled with.
 	"""
 	dataset = load_data()

 	#################################
 	### Prepare the training data ###
 	#################################
 	# Print the end of the dataset
 	print(dataset.tail())
 	# Count the undefined values per column
 	print(dataset.isna().sum())
 	# Drop rows that contain undefined values
 	dataset = dataset.dropna()
 	# 'Origin' is a categorical code, not a quantity: map the codes to country names
 	# and one-hot encode them. Could be done inside the Keras model, but that is
 	# overkill for 3 values.
 	dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
 	# dtype=float is required: pandas >= 2.0 creates bool dummy columns by default,
 	# which turns np.array(train_features) into an object array that the
 	# Normalization layer cannot adapt to.
 	dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='', dtype=float)
 	# Split into training and test data (fixed seed for reproducibility)
 	train_dataset = dataset.sample(frac=0.8, random_state=0)
 	test_dataset = dataset.drop(train_dataset.index)
 	# Show pair plots of the data to see if there are any probable correlations
 	sns.pairplot(train_dataset[['MPG', 'Cylinders', 'Displacement', 'Weight']], diag_kind='kde')
 	plt.show()
 	# Print statistics about the features
 	print(train_dataset.describe().transpose())
 	# Separate the label from the features
 	train_features = train_dataset.copy()
 	test_features = test_dataset.copy()
 	train_labels = train_features.pop('MPG')
 	test_labels = test_features.pop('MPG')

 	##############################
 	### Normalize the features ###
 	##############################
 	# Create the normalization layer (one mean/variance per feature column)
 	normalizer = tf.keras.layers.Normalization(axis=-1)
 	# Fit the preprocessing layer to the data
 	normalizer.adapt(np.array(train_features))
 	# See what effect the normalization has
 	first = np.array(train_features[:1])
 	with np.printoptions(precision=2, suppress=True):
 		print('First example:', first)
 		print()
 		print('Normalized:', normalizer(first).numpy())

 	#######################################
 	### Start with the regression stuff ###
 	#######################################
 	# Build the Keras Sequential model:
 	# normalize all features, then apply a linear transformation with one output (layers.Dense)
 	linear_model = tf.keras.Sequential([
 		normalizer,
 		layers.Dense(units=1)
 	])
 	# Run the untrained model once to check that the shapes fit
 	print(linear_model.predict(train_features[:10]))
 	# Configure the training process
 	linear_model.compile(
 		optimizer=tf.optimizers.Adam(learning_rate=0.1),
 		loss='mean_absolute_error'
 	)
 	# Train for 100 epochs
 	history = linear_model.fit(
 		train_features,
 		train_labels,
 		epochs=100,
 		# Suppress logging.
 		verbose=0,
 		# Calculate validation results on 20% of the training data.
 		validation_split=0.2
 	)
 	# Show training progress
 	hist = pd.DataFrame(history.history)
 	hist['epoch'] = history.epoch
 	print(hist.tail())
 	# Show loss plot
 	plot_loss(history)
 	# Collect results on the test set
 	test_results = {
 		'linear_model': linear_model.evaluate(
 			test_features, test_labels, verbose=0
 		)
 	}
 	# No regression-line plot here: the model takes every feature column as input,
 	# so its predictions cannot be drawn as one curve over the horsepower axis.
 	return test_results
 if __name__ == '__main__':
 	# Run both demos in order: the single-feature model first, then the all-feature model.
 	for run_demo in (regression_single_input, regression_multiple_inputs):
 		run_demo()
@@ -0,0 +1,6 @@
 scikit-learn
 tensorflow
 seaborn
 matplotlib
 numpy
 pandas