Initial commit

Patrick Müller 2021-12-24 20:23:59 +01:00
commit 71a065e52c
4 changed files with 461 additions and 0 deletions

.gitignore (vendored, new file, 157 lines)
@@ -0,0 +1,157 @@
### IntelliJ files
.idea/
### VirtualEnv template
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
.Python
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
.venv
pip-selfcheck.json
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/

main.py (new file, 16 lines)
@@ -0,0 +1,16 @@
# This is a sample Python script.
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.


def print_hi(name):
    # Use a breakpoint in the code line below to debug your script.
    print(f'Hi, {name}')  # Press Ctrl+F8 to toggle the breakpoint.


# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    print_hi('PyCharm')

# See PyCharm help at https://www.jetbrains.com/help/pycharm/

regression.py (new file, 282 lines)
@@ -0,0 +1,282 @@
"""
Linear regression with TensorFlow, following this tutorial: https://www.tensorflow.org/tutorials/keras/regression
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Make NumPy output easier to read.
np.set_printoptions(precision=3, suppress=True)


def load_data():
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
    column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                    'Acceleration', 'Model Year', 'Origin']
    raw_dataset = pd.read_csv(url, names=column_names,
                              na_values='?', comment='\t',
                              sep=' ', skipinitialspace=True)
    return raw_dataset


def plot_loss(history):
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.ylim([0, 10])
    plt.xlabel('Epoch')
    plt.ylabel('Error [MPG]')
    plt.legend()
    plt.grid(True)
    plt.show()


def plot_horsepower(x, y, train_features, train_labels):
    plt.scatter(train_features['Horsepower'], train_labels, label='Data')
    plt.plot(x, y, color='k', label='Predictions')
    plt.xlabel('Horsepower')
    plt.ylabel('MPG')
    plt.legend()
    plt.show()


def regression_single_input():
    raw_dataset = load_data()
    dataset = raw_dataset.copy()

    #################################
    ### Prepare the training data ###
    #################################
    # Print the end of the dataset
    print(dataset.tail())
    # Check for any undefined values
    print(dataset.isna().sum())
    # Drop rows with missing values
    dataset = dataset.dropna()
    # Map the numeric 'Origin' codes to country names, then one-hot encode them.
    # This could also be done with a Keras preprocessing layer, but that is overkill for 3 values.
    dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
    dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='')
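    # After get_dummies, 'Origin' is replaced by three 0/1 indicator columns,
    # one per country ('Europe', 'Japan', 'USA').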
    # Split into training and test data
    train_dataset = dataset.sample(frac=0.8, random_state=0)
    test_dataset = dataset.drop(train_dataset.index)

    # Print pair plots of the data to see if there are any probable correlations
    sns.pairplot(train_dataset[['MPG', 'Cylinders', 'Displacement', 'Weight']], diag_kind='kde')
    plt.show()

    # Print statistics about the features
    print(train_dataset.describe().transpose())

    # Separate the label from the features
    train_features = train_dataset.copy()
    test_features = test_dataset.copy()
    train_labels = train_features.pop('MPG')
    test_labels = test_features.pop('MPG')
    ##############################
    ### Normalize the features ###
    ##############################
    # Create the normalization layer
    normalizer = tf.keras.layers.Normalization(axis=-1)
    # Fit the preprocessing layer to the data
    normalizer.adapt(np.array(train_features))

    # See what effect the normalization has
    first = np.array(train_features[:1])
    with np.printoptions(precision=2, suppress=True):
        print('First example:', first)
        print()
        print('Normalized:', normalizer(first).numpy())
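    # Note: the Normalization layer standardizes each feature as
    # (x - mean) / sqrt(variance), with mean and variance learned by adapt().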
    #######################################
    ### Start with the regression stuff ###
    #######################################
    # Create Horsepower NumPy array and normalize it
    horsepower = np.array(train_features['Horsepower'])
    horsepower_normalizer = layers.Normalization(input_shape=[1, ], axis=None)
    horsepower_normalizer.adapt(horsepower)

    # Build the Keras Sequential model:
    # apply a linear transformation to produce 1 output using a linear layer (layers.Dense)
    horsepower_model = tf.keras.Sequential([
        horsepower_normalizer,
        layers.Dense(units=1)
    ])
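    # The model therefore computes y = w * x_normalized + b, where the Dense
    # layer's kernel w and bias b are the only trainable parameters.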
    # summary() prints the architecture itself, so no extra print() is needed
    horsepower_model.summary()

    # Run the model without training
    print(horsepower_model.predict(horsepower[:10]))

    # Configure the training process
    horsepower_model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=0.1),
        loss='mean_absolute_error'
    )
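    # Mean absolute error: mean(|y_true - y_pred|), measured in the label's
    # units (MPG), which makes the loss directly interpretable.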
    # Train for 100 epochs
    history = horsepower_model.fit(
        train_features['Horsepower'],
        train_labels,
        epochs=100,
        # Suppress logging.
        verbose=0,
        # Calculate validation results on 20% of the training data.
        validation_split=0.2
    )
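    # fit() returns a History object; history.history maps each metric name
    # ('loss', 'val_loss') to a list of per-epoch values.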
    # Show training progress
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    print(hist.tail())

    # Show loss plot
    plot_loss(history)

    # Collect results on test set
    test_results = {}
    test_results['horsepower_model'] = horsepower_model.evaluate(
        test_features['Horsepower'],
        test_labels, verbose=0
    )
    # Plot the regression line
    x = tf.linspace(0.0, 250, 251)
    y = horsepower_model.predict(x)
    plot_horsepower(x, y, train_features, train_labels)


def regression_multiple_inputs():
    raw_dataset = load_data()
    dataset = raw_dataset.copy()

    #################################
    ### Prepare the training data ###
    #################################
    # Print the end of the dataset
    print(dataset.tail())
    # Check for any undefined values
    print(dataset.isna().sum())
    # Drop rows with missing values
    dataset = dataset.dropna()
    # Map the numeric 'Origin' codes to country names, then one-hot encode them.
    # This could also be done with a Keras preprocessing layer, but that is overkill for 3 values.
    dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
    dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='')

    # Split into training and test data
    train_dataset = dataset.sample(frac=0.8, random_state=0)
    test_dataset = dataset.drop(train_dataset.index)

    # Print pair plots of the data to see if there are any probable correlations
    sns.pairplot(train_dataset[['MPG', 'Cylinders', 'Displacement', 'Weight']], diag_kind='kde')
    plt.show()

    # Print statistics about the features
    print(train_dataset.describe().transpose())

    # Separate the label from the features
    train_features = train_dataset.copy()
    test_features = test_dataset.copy()
    train_labels = train_features.pop('MPG')
    test_labels = test_features.pop('MPG')
    ##############################
    ### Normalize the features ###
    ##############################
    # Create the normalization layer
    normalizer = tf.keras.layers.Normalization(axis=-1)
    # Fit the preprocessing layer to the data
    normalizer.adapt(np.array(train_features))

    # See what effect the normalization has
    first = np.array(train_features[:1])
    with np.printoptions(precision=2, suppress=True):
        print('First example:', first)
        print()
        print('Normalized:', normalizer(first).numpy())
    #######################################
    ### Start with the regression stuff ###
    #######################################
    # Create Horsepower NumPy array and normalize it
    horsepower = np.array(train_features['Horsepower'])
    horsepower_normalizer = layers.Normalization(input_shape=[1, ], axis=None)
    horsepower_normalizer.adapt(horsepower)
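    # Note: this horsepower normalizer appears unused below; the multi-input
    # model feeds all features through the all-feature normalizer instead.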
    # Build the Keras Sequential model:
    # apply a linear transformation to produce 1 output using a linear layer (layers.Dense)
    linear_model = tf.keras.Sequential([
        normalizer,
        layers.Dense(units=1)
    ])

    # Run the model without training
    print(linear_model.predict(train_features[:10]))

    # Configure the training process
    linear_model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=0.1),
        loss='mean_absolute_error'
    )
    # Train for 100 epochs
    history = linear_model.fit(
        train_features,
        train_labels,
        epochs=100,
        # Suppress logging.
        verbose=0,
        # Calculate validation results on 20% of the training data.
        validation_split=0.2
    )

    # Show training progress
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    print(hist.tail())

    # Show loss plot
    plot_loss(history)
    # Collect results on test set
    test_results = {}
    test_results['linear_model'] = linear_model.evaluate(
        test_features, test_labels, verbose=0
    )

    # Plotting a single regression line doesn't work for multiple inputs,
    # since that would require visualizing the fit in higher dimensions.
    # x = tf.linspace(0.0, 250, 251)
    # y = linear_model.predict(x)
    # plot_horsepower(x, y, train_features, train_labels)
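    # A possible extension (not in the original commit): report the collected
    # test-set error as a table, as the TensorFlow tutorial does, e.g.:
    # print(pd.DataFrame(test_results, index=['Mean absolute error [MPG]']).T)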


if __name__ == '__main__':
    regression_single_input()
    regression_multiple_inputs()

requirements.txt (new file, 6 lines)
@@ -0,0 +1,6 @@
scikit-learn
tensorflow
seaborn
matplotlib
numpy
pandas
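
Setup note: the dependencies above can be installed with pip install -r requirements.txt. scikit-learn is listed although the code does not import it yet.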