# Source code for inforcehub.anonymize
import hashlib
import bcrypt
import pandas as pd
class Anonymize:
    """
    Pseudo-anonymize selected columns of a Pandas dataframe.

    Each value is converted to text, prefixed with the instance **salt**
    and hashed with ``hashlib.md5``; the hexadecimal digest replaces the
    original value. This is salted one-way hashing - it is neither
    reversible encryption nor an HMAC construction.

    When instantiating an object of this class, the **salt** init
    attribute can be specified, which enables results to be reproduced.
    If none is supplied, a random salt is generated with
    ``bcrypt.gensalt()`` instead.

    :param: str salt: an optional passphrase - if empty, salt will be randomized
    """

    # NOTE(review): MD5 is kept so that digests produced with an existing
    # salt stay reproducible. MD5 is no longer regarded as collision
    # resistant; switching algorithm would change every generated value,
    # so that decision is left to the maintainers.

    def __init__(self, salt=None):
        """
        Store the salt as a bytestring.

        :param: salt: optional passphrase; any falsy value (``None``,
            ``""``, ``0``) results in a random salt from ``bcrypt.gensalt()``
        """
        # Bytes are required so the salt can be concatenated with the
        # encoded value inside encode().
        self.salt = str(salt).encode() if salt else bcrypt.gensalt()

    def encode(self, text):
        """
        Hash a single number, text or date together with the salt.

        :param: text: the text, value or date to be hashed; it is passed
            through ``str()`` before being encoded to bytes
        :returns: the hexadecimal MD5 digest of ``salt + str(text)``
        :rtype: str
        """
        payload = self.salt + str(text).encode()
        return hashlib.md5(payload).hexdigest()

    def transform(self, df, columns, verbose=False):
        """
        Replace the selected columns of *df* with their hashed values.

        The dataframe passed in is modified **in place**. The return value
        is a lookup table holding, for every transformed column ``c``, the
        original values under ``c`` and the hashed values under ``c_``; it
        can be used as a pseudo-anonymization key.

        All requested columns are checked before any data is touched, so a
        missing column never leaves *df* partially transformed. A column
        named more than once is transformed only once.

        :param: pd.DataFrame() df: A Pandas dataframe to be transformed
        :param: list columns: A list of columns to be transformed (a single
            column name given as a string is also accepted)
        :param: bool verbose: (default=False) If true will print status
        :returns: a pseudo-anonymization lookup table
        :rtype: pd.DataFrame()
        :raises Exception: if any requested column is not in *df*
        """
        # In case user puts in single column name as a string not a list
        if isinstance(columns, str):
            columns = [columns]

        # Drop repeated names (keeping first-seen order); hashing a column
        # twice would overwrite its original values in the lookup table.
        columns = list(dict.fromkeys(columns))

        if verbose:
            # str() so that non-string column labels can be reported too
            print("Will convert columns: %s" % ", ".join(map(str, columns)))
            print("Encrypting %i rows per column ...\n" % df.shape[0])

        # Validate everything up front so that a failure cannot leave the
        # caller's dataframe half transformed.
        for column in columns:
            if column not in df.columns:
                raise Exception(
                    "Column %s cannot be found in dataframe" % (column,)
                )

        lookup = pd.DataFrame()
        for column in columns:
            lookup[column] = df[column]
            df[column] = df[column].map(self.encode)
            lookup[str(column) + "_"] = df[column]
            if verbose:
                print("Finished encrypting column %s" % (column,))
        return lookup