import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df = pd.read_csv('inraw_tycobib13_16.csv')
print(df.shape)
df.head()
# generate id for join
df['tycobib'] = df.bibNumber.astype('str') + df.collcode + df.itemtype
biba = pd.read_csv('tycobib_unique.csv')
print(biba.shape)
biba.head()
biba = biba.drop(['Unnamed: 0'], axis=1)
biba.head()
biba.loc[biba.tycobib=='2511808ncbocdjccd']
merger = pd.merge(df, biba[['ItemLocation', 'tycobib', 'PublicationYear']], on='tycobib', how='left')
print(merger.shape)
merger.head()
dur = merger[['bibNumber','barcode','cout', 'cin', 'PublicationYear', 'deweyClass', 'ItemLocation', 'tycobib']]
dur.dtypes
# check missing values in PublicationYear
dur.loc[dur.PublicationYear.isnull()].shape
dur['cout'] = pd.to_datetime(dur.cout)
dur['cin'] = pd.to_datetime(dur.cin)
dur['duration'] = (dur.cin - dur.cout).astype('timedelta64[D]')
print(dur.shape)
dur.head()
dur5 = dur.loc[dur.duration<=2000,]
print(dur5.shape)
dur5.duration.max()
fig, ax = plt.subplots(figsize=(16,6))
sns.distplot(dur5.duration, ax = ax)
dur5['month_out'] = dur5.cout.dt.month
dur5['month_in'] = dur5.cin.dt.month
dur5['hh_out'] = dur5.cout.dt.hour
dur5['hh_in'] = dur5.cin.dt.hour
dur5['dow_out'] = dur5.cout.dt.dayofweek
dur5['dow_in'] = dur5.cin.dt.dayofweek
dur5.head()
dur5['dew'] = dur5.deweyClass.fillna(0).astype('int')
dur5.head()
dur5.cout.min()
#dur5[dur5.cout.dt.year!=2012]
dur5['dd_from_cout'] = (dur5.cout - dur5.cout.min()).astype('timedelta64[D]')
dur5['dd_from_cin'] = (dur5.cin - dur5.cout.min()).astype('timedelta64[D]')
dur5.head(10)
# count null dewey
dur5.loc[dur5.deweyClass.isnull(),].shape
print(dur5.dd_from_cin.max())
print(dur5.duration.max())
print(np.log(dur5.duration.max()))
print(dur5.dew.nunique())
print(dur5.deweyClass.nunique())
print(dur5.dd_from_cin.max())
# sort by location and create id_key
#dur5 = dur5.sort_values(by=['ItemLocation'])
# generate id variable
#dur5.insert(0, 'branch_order', range(0, len(dur5)))
#
#dur5.head(10)
# sort by location and create id_key
#dur5 = dur5.sort_values(by=['bibNumber'])
# generate id variable
#dur5.insert(0, 'bib_order', range(0, len(dur5)))
#
#dur5.head(10)
#dur5 = dur5.sort_values(by = ['cout'])
# generate id variable
#dur5.insert(0, 'date_order', range(0, len(dur5)))
#dur5.head()
# check if some of the fields are unique
dur5['ranker'] = dur5.barcode.rank(method='dense')
print(dur5.ranker.max())
dur5.loc[dur5.barcode==10061216916,]
# check if ranker and barcode have identical number of unqiue
print(dur5.ranker.nunique())
print(dur5.barcode.nunique())
print(dur5.ranker.max())
#
dur5.ItemLocation.value_counts()
dur5.sort_values(by='ItemLocation', inplace=True)
dur5.tail(10)
# generate numeric category for coloring later
dur5['col'] = dur5.ItemLocation.astype('category')
dur5['col_code'] = dur5.col.cat.codes.astype('int64')
dur5.drop('col', axis=1, inplace=True)
print(dur5.col_code.max())
dur5.head()
# get borders for branches
dur5.loc[dur5.ItemLocation=='cen']
dur5['ordered_idx'] = dur5.groupby(['ItemLocation', 'barcode']).ngroup()
print(dur5.ordered_idx.nunique())
print(dur5.ordered_idx.max())
dur5.groupby(['ItemLocation', 'col_code']).size().sort_values(ascending=False)
dur5.to_csv('prep_ink.csv')
dur5.iloc[:5000,].to_csv('prep_ink2.csv')