Source can be found in https://github.com/elvinado/Tidying-data
Notice that at the bottom of the excel there are intentional notes. These are to be excluded from tranformation
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import re
import openpyxl
!python --version
pd.__version__
np.version.version
matplotlib.__version__
re.__version__
openpyxl.__version__
Here we ignore the header by setting header=None so we can manually deal with them
# instantiate an excel object
xl = pd.ExcelFile("Untidy Data.xlsx")
# find out the all the available sheets
xl.sheet_names
config_a = xl.parse(xl.sheet_names[0],header=None)
config_b = xl.parse(xl.sheet_names[1],header=None)
config_a.head()
print(f"Config A shape: {config_a.shape}\nConfig B shape: {config_b.shape}")
Extra notes in Config A sheet causes it to have more rows.
# checking the last 5 rows of the dataframe
config_a.tail()
We'll make it better by limiting the rows during loading
# Alternative way to read excel. Here we limit to 21 rows
config_a = pd.read_excel("Untidy Data.xlsx","Config A",header=None,nrows=21)
config_a.tail()
A good reason to use regex pattern instead of extracting by hard coding is flexibility if any additional dimensions were to be added in the future dataset
Located at 0th row
temp = config_a.iloc[0].dropna().unique()
temp
# drop the first column
temp = temp[1:]
temp
pattern = "Material (\S+)"
# List comprehension searching the pattern in each values
materials = [re.search(pattern,t).group(1) for t in temp]
materials
Located at 1st row
temp = config_a.iloc[1].dropna().unique()
temp
# drop the first column
temp = temp[1:]
temp
pattern = "(\d+)\""
# List comprehension searching the pattern in each values
sizes = [re.search(pattern,t).group(1) for t in temp]
# convert to interger
sizes = [int(size) for size in sizes]
sizes
Located at 0th column
temp = config_a[0].dropna().unique()
temp
# only take the last 2 columns
temp = temp[-2:]
temp
pattern = "(\d+)hz"
# List comprehension searching the pattern in each values
frequencies = [re.search(pattern,t).group(1) for t in temp]
# convert to interger
frequencies = [int(f) for f in frequencies]
frequencies
Located at 2nd row
temp = config_a.iloc[2].dropna().unique()
temp
# drop the first column
temp = temp[1:]
temp
# for consitency purposes just convert the numpy array into python list
observations = temp.tolist()
observations
Located in the sheet names
configs = xl.sheet_names
configs
Our columns are going to be dimensions of material, frequency, size, configuration, as well as three observations units.
The observation units can later on be separated into 3 separate table if needed.
dimensions = ["Configuration","Material","Frequency","Size"]
df = pd.DataFrame(columns = dimensions + observations)
df.columns
# take a peak at the data we want to loop
config_a.head(15)[list(range(16))]
# Defining skip parameters unique to the source excel to allow us to do looping
col_skip = 1 # our first data is in col 1, row 3
row_skip = 3
row_frq_set = 9 # new frequency every 9 rows
col_obs_set = 3 # new size every 3 observation columns
col_mat_set = col_obs_set * len(sizes) # new material every 5 set of 5 observation
# try extract one set of data for example Config A, Material A, 100hz, 5"
temp_df = config_a.iloc[row_skip:row_skip + row_frq_set,col_skip:col_skip + col_obs_set].copy()
temp_df.columns = observations
temp_df
# assign all needed dimension to the df
temp_dimension = ["Config A", "A", 100, 5]
dimensions, temp_dimension
for i,j in zip(dimensions,temp_dimension): # using zip to combine two list
temp_df[i] = j
temp_df
# take a peak at the data we want to loop
config_a.head(15)[list(range(16))]
# these are 5 layers of for loops. Try understand the meaning of set values i.e. col_mat_set, col_obs_set, row_frq_set
print("These are for debugging purposes")
for config,source_df in zip(configs,[config_a,config_b]): # using zip to combine two list
print(f"\nConfiguration: {config}")
start_col = col_skip # reset start column each sheet
start_row = row_skip # reset start row each sheet
for imat,material in enumerate(materials): # using enumerate to automatically make list with index as first element
print(f"Material: {material}")
m_col = start_col + (col_mat_set * imat) # material repeating by columns, it is column's set of observation times number of sizes
for ifreq,frequency in enumerate(frequencies):
print(f"Frequency: {frequency}")
row = start_row + (row_frq_set * ifreq) # frequency repeating by rows, it is row's set of frequency
for isize,size in enumerate(sizes):
col = m_col + (col_obs_set * isize) #size repeating by column, it is column's set of observation
temp_df = source_df.iloc[row:row + row_frq_set,col:col + col_obs_set].copy()
print(f"Size: {size} Row {row}:{row + row_frq_set}, Col {col}:{col + col_obs_set}")
temp_df.columns = observations # renaming the columns
temp_dim = [config,material,frequency,size]
for key,value in zip(dimensions,temp_dim): # applying dimension to each observation row
temp_df[key] = value
df = df.append(temp_df,ignore_index=True) # Appending dataset into the designed dataframe
print(f"Finally have {len(df)} rows in the data frame")
# observation columns need to be converted to numeric
for o in observations:
df[o] = pd.to_numeric(df[o])
df.dtypes
df[dimensions].describe()
df[observations].describe()
df[observations].plot(title="This plot does not say anything meaningful")
for j,m in enumerate(materials,1):
for i,_ in enumerate(observations):
title = f"Material {m}: {observations[i]} vs {observations[i-1]} {i+1} {j}"
df[df["Material"] == m].plot(x=observations[i],y=observations[i-1],kind = "scatter",title=title)
fig,axes = plt.subplots(2,3,figsize=(12,8))
plt.subplots_adjust(wspace=0.3, hspace=0.4)
for j,m in enumerate(materials,1):
for i,_ in enumerate(observations):
title = f"Material {m}: {observations[i]} vs {observations[i-1]}"
df[df["Material"] == m].plot(x=observations[i],y=observations[i-1],kind = "scatter",title=title,ax=axes[j-1,i])
data = df[df["Material"] == materials[0]]
alpha = data.Alpha
beta = data.Beta
gamma = data.Gamma
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(alpha,beta,gamma, c='b', marker='o')
ax.set_xlabel('alpha')
ax.set_ylabel('beta')
ax.set_zlabel('gamma')
plt.show()
This operation requires Openpyxl as backend
# df.to_excel("Output.xlsx",sheet_name="Tidy",index=False) # Please uncomment to export out.
I believe 5 skills promised in the beginning have been demonstrated here.
Hopefully readers may benefit from this example.
Alvin Alexander is a marathon runner with a strong inclination for scripting & programming while working in oil & gas industry. Chemist by training, data management by trade, learner by vocation.
Also thanks to Airi for providing this original dataset for the original transformation
Copyright © 2020