Initial commit

Patrick Müller 2021-12-24 20:23:59 +01:00
commit 71a065e52c
4 changed files with 461 additions and 0 deletions

.gitignore (vendored, new file, 157 lines)
@@ -0,0 +1,157 @@
### IntelliJ files
.idea/
### VirtualEnv template
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
.Python
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
.venv
pip-selfcheck.json
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/

main.py (new file, 16 lines)
@@ -0,0 +1,16 @@
# This is a sample Python script.
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.


def print_hi(name):
    # Use a breakpoint in the code line below to debug your script.
    print(f'Hi, {name}')  # Press Ctrl+F8 to toggle the breakpoint.


# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    print_hi('PyCharm')

# See PyCharm help at https://www.jetbrains.com/help/pycharm/

regression.py (new file, 282 lines)
@@ -0,0 +1,282 @@
"""
Linear regression with TensorFlow, following this tutorial: https://www.tensorflow.org/tutorials/keras/regression
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Make NumPy output easier to read.
np.set_printoptions(precision=3, suppress=True)


def load_data():
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
    column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                    'Acceleration', 'Model Year', 'Origin']
    raw_dataset = pd.read_csv(url, names=column_names,
                              na_values='?', comment='\t',
                              sep=' ', skipinitialspace=True)
    return raw_dataset


def plot_loss(history):
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.ylim([0, 10])
    plt.xlabel('Epoch')
    plt.ylabel('Error [MPG]')
    plt.legend()
    plt.grid(True)
    plt.show()


def plot_horsepower(x, y, train_features, train_labels):
    plt.scatter(train_features['Horsepower'], train_labels, label='Data')
    plt.plot(x, y, color='k', label='Predictions')
    plt.xlabel('Horsepower')
    plt.ylabel('MPG')
    plt.legend()
    plt.show()


def regression_single_input():
    raw_dataset = load_data()
    dataset = raw_dataset.copy()

    #################################
    ### Prepare the training data ###
    #################################
    # Print the end of the dataset
    print(dataset.tail())
    # Check for any undefined values
    print(dataset.isna().sum())
    # Drop rows with missing values
    dataset = dataset.dropna()
    # Map the numeric 'Origin' codes to country names, then one-hot encode them.
    # This could also be done with a Keras preprocessing layer, but that is overkill for 3 values.
    dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
    dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='')
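    # After get_dummies, 'Origin' is replaced by three 0/1 indicator columns,
    # one per country ('Europe', 'Japan', 'USA').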
    # Split into training and test data
    train_dataset = dataset.sample(frac=0.8, random_state=0)
    test_dataset = dataset.drop(train_dataset.index)

    # Print pair plots of the data to see if there are any probable correlations
    sns.pairplot(train_dataset[['MPG', 'Cylinders', 'Displacement', 'Weight']], diag_kind='kde')
    plt.show()

    # Print statistics about the features
    print(train_dataset.describe().transpose())

    # Separate the label from the features
    train_features = train_dataset.copy()
    test_features = test_dataset.copy()
    train_labels = train_features.pop('MPG')
    test_labels = test_features.pop('MPG')
    ##############################
    ### Normalize the features ###
    ##############################
    # Create the normalization layer
    normalizer = tf.keras.layers.Normalization(axis=-1)
    # Fit the preprocessing layer to the data
    normalizer.adapt(np.array(train_features))

    # See what effect the normalization has
    first = np.array(train_features[:1])
    with np.printoptions(precision=2, suppress=True):
        print('First example:', first)
        print()
        print('Normalized:', normalizer(first).numpy())
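    # Note: the Normalization layer standardizes each feature as
    # (x - mean) / sqrt(variance), with mean and variance learned by adapt().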
    #######################################
    ### Start with the regression stuff ###
    #######################################
    # Create Horsepower NumPy array and normalize it
    horsepower = np.array(train_features['Horsepower'])
    horsepower_normalizer = layers.Normalization(input_shape=[1, ], axis=None)
    horsepower_normalizer.adapt(horsepower)

    # Build the Keras Sequential model:
    # apply a linear transformation to produce 1 output using a linear layer (layers.Dense)
    horsepower_model = tf.keras.Sequential([
        horsepower_normalizer,
        layers.Dense(units=1)
    ])
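    # The model therefore computes y = w * x_normalized + b, where the Dense
    # layer's kernel w and bias b are the only trainable parameters.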
    # summary() prints the architecture itself, so no extra print() is needed
    horsepower_model.summary()

    # Run the model without training
    print(horsepower_model.predict(horsepower[:10]))

    # Configure the training process
    horsepower_model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=0.1),
        loss='mean_absolute_error'
    )
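    # Mean absolute error: mean(|y_true - y_pred|), measured in the label's
    # units (MPG), which makes the loss directly interpretable.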
    # Train for 100 epochs
    history = horsepower_model.fit(
        train_features['Horsepower'],
        train_labels,
        epochs=100,
        # Suppress logging.
        verbose=0,
        # Calculate validation results on 20% of the training data.
        validation_split=0.2
    )
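    # fit() returns a History object; history.history maps each metric name
    # ('loss', 'val_loss') to a list of per-epoch values.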
    # Show training progress
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    print(hist.tail())

    # Show loss plot
    plot_loss(history)

    # Collect results on test set
    test_results = {}
    test_results['horsepower_model'] = horsepower_model.evaluate(
        test_features['Horsepower'],
        test_labels, verbose=0
    )
    # Plot the regression line
    x = tf.linspace(0.0, 250, 251)
    y = horsepower_model.predict(x)
    plot_horsepower(x, y, train_features, train_labels)


def regression_multiple_inputs():
    raw_dataset = load_data()
    dataset = raw_dataset.copy()

    #################################
    ### Prepare the training data ###
    #################################
    # Print the end of the dataset
    print(dataset.tail())
    # Check for any undefined values
    print(dataset.isna().sum())
    # Drop rows with missing values
    dataset = dataset.dropna()
    # Map the numeric 'Origin' codes to country names, then one-hot encode them.
    # This could also be done with a Keras preprocessing layer, but that is overkill for 3 values.
    dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
    dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='')

    # Split into training and test data
    train_dataset = dataset.sample(frac=0.8, random_state=0)
    test_dataset = dataset.drop(train_dataset.index)

    # Print pair plots of the data to see if there are any probable correlations
    sns.pairplot(train_dataset[['MPG', 'Cylinders', 'Displacement', 'Weight']], diag_kind='kde')
    plt.show()

    # Print statistics about the features
    print(train_dataset.describe().transpose())

    # Separate the label from the features
    train_features = train_dataset.copy()
    test_features = test_dataset.copy()
    train_labels = train_features.pop('MPG')
    test_labels = test_features.pop('MPG')
    ##############################
    ### Normalize the features ###
    ##############################
    # Create the normalization layer
    normalizer = tf.keras.layers.Normalization(axis=-1)
    # Fit the preprocessing layer to the data
    normalizer.adapt(np.array(train_features))

    # See what effect the normalization has
    first = np.array(train_features[:1])
    with np.printoptions(precision=2, suppress=True):
        print('First example:', first)
        print()
        print('Normalized:', normalizer(first).numpy())
    #######################################
    ### Start with the regression stuff ###
    #######################################
    # Create Horsepower NumPy array and normalize it
    horsepower = np.array(train_features['Horsepower'])
    horsepower_normalizer = layers.Normalization(input_shape=[1, ], axis=None)
    horsepower_normalizer.adapt(horsepower)
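    # Note: this horsepower normalizer appears unused below; the multi-input
    # model feeds all features through the all-feature normalizer instead.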
    # Build the Keras Sequential model:
    # apply a linear transformation to produce 1 output using a linear layer (layers.Dense)
    linear_model = tf.keras.Sequential([
        normalizer,
        layers.Dense(units=1)
    ])

    # Run the model without training
    print(linear_model.predict(train_features[:10]))

    # Configure the training process
    linear_model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=0.1),
        loss='mean_absolute_error'
    )
    # Train for 100 epochs
    history = linear_model.fit(
        train_features,
        train_labels,
        epochs=100,
        # Suppress logging.
        verbose=0,
        # Calculate validation results on 20% of the training data.
        validation_split=0.2
    )

    # Show training progress
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    print(hist.tail())

    # Show loss plot
    plot_loss(history)
    # Collect results on test set
    test_results = {}
    test_results['linear_model'] = linear_model.evaluate(
        test_features, test_labels, verbose=0
    )

    # Plotting a single regression line doesn't work for multiple inputs,
    # since that would require visualizing the fit in higher dimensions.
    # x = tf.linspace(0.0, 250, 251)
    # y = linear_model.predict(x)
    # plot_horsepower(x, y, train_features, train_labels)
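    # A possible extension (not in the original commit): report the collected
    # test-set error as a table, as the TensorFlow tutorial does, e.g.:
    # print(pd.DataFrame(test_results, index=['Mean absolute error [MPG]']).T)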


if __name__ == '__main__':
    regression_single_input()
    regression_multiple_inputs()

requirements.txt (new file, 6 lines)
@@ -0,0 +1,6 @@
scikit-learn
tensorflow
seaborn
matplotlib
numpy
pandas
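
Setup note: the dependencies above can be installed with pip install -r requirements.txt. scikit-learn is listed although the code does not import it yet.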