In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline
In [2]:
df = pd.read_csv('inraw_tycobib13_16.csv')
print(df.shape)
df.head()
(130003, 12)
Out[2]:
id itemNumber bibNumber cout cin collcode itemtype barcode title callNumber deweyClass subj
0 60914084 3565426 2597502 2012-12-31 13:19:00 2012-12-31 13:29:00 nacd accd 10067444736 Afro CD 782.42165 G4125A 782.42165 NaN
1 60921659 3565426 2597502 2012-12-31 13:29:00 2012-12-31 16:05:00 nacd accd 10067444736 Afro CD 782.42165 G4125A 782.42165 NaN
2 60921893 3206079 2511808 2012-12-31 16:09:00 2012-12-31 16:09:00 ncbocd jccd 10061359948 Little Bear audio collection CDJ M6611L NaN NaN
3 60928589 4465799 2802835 2012-12-31 12:08:00 2013-01-02 11:06:00 cacd accd 10066685651 Little broken hearts CD 782.42166 J722L 782.42166 NaN
4 60928738 2787258 2412543 2012-12-31 15:52:00 2013-01-02 11:11:00 cacd accd 10056618860 You are my lady CD 782.42166 W317Y 782.42166 NaN
In [3]:
# generate id for join 
df['tycobib'] = df.bibNumber.astype('str') + df.collcode + df.itemtype
In [4]:
biba = pd.read_csv('tycobib_unique.csv')
print(biba.shape) 
biba.head()
(10900, 16)
Out[4]:
Unnamed: 0 BibNum Title Author ISBN PublicationYear Publisher Subjects ItemType ItemCollection FloatingItem ItemLocation ReportDate ItemCount locbib tycobib
0 0 261 American place names, by Alfred H. Holt. New Y... Holt, Alfred Hubbard, 1897- NaN 1969. Gale Research Co., Names Geographical United States arbk cs9r NaN cen 09/01/2017 1 cen26109/01/2017cs9rarbk 261cs9rarbk
1 1 813 Central Park country, a tune within us; photos... Johnston, Nancy NaN [1968] Sierra Club Central Park New York N Y acbk canf NaN cen 09/01/2017 1 cen81309/01/2017canfacbk 813canfacbk
2 2 2524 The Virginia dynasties; the emergence of "King... Dowdey, Clifford, 1904-1979. NaN [1969] Little, Brown Carter Robert 1663 1732, Upper class Virginia,... acbk canf NaN cen 09/01/2017 1 cen252409/01/2017canfacbk 2524canfacbk
3 3 3119 The light princess. With pictures by Maurice S... MacDonald, George, 1824-1905 NaN [1969] Farrar, Straus and Giroux Fairy tales, Fairy tales England Juvenile fiction jcbk ncfic NaN wts 09/01/2017 1 wts311909/01/2017ncficjcbk 3119ncficjcbk
4 4 4439 An encyclopaedic dictionary of heraldry, by Ju... Franklyn, Julian, 1899-1970 0080132979 [1969] Pergamon Press Heraldry Dictionaries arbk cs9r NaN cen 09/01/2017 1 cen443909/01/2017cs9rarbk 4439cs9rarbk
In [5]:
biba = biba.drop(['Unnamed: 0'], axis=1)
biba.head()
Out[5]:
BibNum Title Author ISBN PublicationYear Publisher Subjects ItemType ItemCollection FloatingItem ItemLocation ReportDate ItemCount locbib tycobib
0 261 American place names, by Alfred H. Holt. New Y... Holt, Alfred Hubbard, 1897- NaN 1969. Gale Research Co., Names Geographical United States arbk cs9r NaN cen 09/01/2017 1 cen26109/01/2017cs9rarbk 261cs9rarbk
1 813 Central Park country, a tune within us; photos... Johnston, Nancy NaN [1968] Sierra Club Central Park New York N Y acbk canf NaN cen 09/01/2017 1 cen81309/01/2017canfacbk 813canfacbk
2 2524 The Virginia dynasties; the emergence of "King... Dowdey, Clifford, 1904-1979. NaN [1969] Little, Brown Carter Robert 1663 1732, Upper class Virginia,... acbk canf NaN cen 09/01/2017 1 cen252409/01/2017canfacbk 2524canfacbk
3 3119 The light princess. With pictures by Maurice S... MacDonald, George, 1824-1905 NaN [1969] Farrar, Straus and Giroux Fairy tales, Fairy tales England Juvenile fiction jcbk ncfic NaN wts 09/01/2017 1 wts311909/01/2017ncficjcbk 3119ncficjcbk
4 4439 An encyclopaedic dictionary of heraldry, by Ju... Franklyn, Julian, 1899-1970 0080132979 [1969] Pergamon Press Heraldry Dictionaries arbk cs9r NaN cen 09/01/2017 1 cen443909/01/2017cs9rarbk 4439cs9rarbk
In [6]:
biba.loc[biba.tycobib=='2511808ncbocdjccd']
Out[6]:
BibNum Title Author ISBN PublicationYear Publisher Subjects ItemType ItemCollection FloatingItem ItemLocation ReportDate ItemCount locbib tycobib
4952 2511808 Little Bear audio collection [sound recording]... Minarik, Else Holmelund 0061227439, 9780061227431 2007. Harper Children's Audio, Bears Juvenile fiction, Mothers and sons Juven... jccd ncbocd Floating dth 09/01/2017 1 dth251180809/01/2017ncbocdjccd 2511808ncbocdjccd
In [7]:
merger = pd.merge(df, biba[['ItemLocation', 'tycobib', 'PublicationYear']], on='tycobib', how='left')
print(merger.shape)
(130003, 15)
In [8]:
merger.head()
Out[8]:
id itemNumber bibNumber cout cin collcode itemtype barcode title callNumber deweyClass subj tycobib ItemLocation PublicationYear
0 60914084 3565426 2597502 2012-12-31 13:19:00 2012-12-31 13:29:00 nacd accd 10067444736 Afro CD 782.42165 G4125A 782.42165 NaN 2597502nacdaccd col c2002.
1 60921659 3565426 2597502 2012-12-31 13:29:00 2012-12-31 16:05:00 nacd accd 10067444736 Afro CD 782.42165 G4125A 782.42165 NaN 2597502nacdaccd col c2002.
2 60921893 3206079 2511808 2012-12-31 16:09:00 2012-12-31 16:09:00 ncbocd jccd 10061359948 Little Bear audio collection CDJ M6611L NaN NaN 2511808ncbocdjccd dth 2007.
3 60928589 4465799 2802835 2012-12-31 12:08:00 2013-01-02 11:06:00 cacd accd 10066685651 Little broken hearts CD 782.42166 J722L 782.42166 NaN 2802835cacdaccd cen p2012.
4 60928738 2787258 2412543 2012-12-31 15:52:00 2013-01-02 11:11:00 cacd accd 10056618860 You are my lady CD 782.42166 W317Y 782.42166 NaN 2412543cacdaccd cen p2007.
In [9]:
dur = merger[['bibNumber','barcode','cout', 'cin', 'PublicationYear', 'deweyClass', 'ItemLocation', 'tycobib']]
dur.dtypes
Out[9]:
bibNumber            int64
barcode              int64
cout                object
cin                 object
PublicationYear     object
deweyClass         float64
ItemLocation        object
tycobib             object
dtype: object
In [10]:
# check missing values in PublicationYear 
dur.loc[dur.PublicationYear.isnull()].shape
Out[10]:
(17, 8)
In [11]:
dur['cout'] = pd.to_datetime(dur.cout)
dur['cin'] = pd.to_datetime(dur.cin)
C:\Users\noibar\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
C:\Users\noibar\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
In [12]:
dur['duration'] = (dur.cin - dur.cout).astype('timedelta64[D]')
print(dur.shape)
dur.head()
(130003, 9)
C:\Users\noibar\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
Out[12]:
bibNumber barcode cout cin PublicationYear deweyClass ItemLocation tycobib duration
0 2597502 10067444736 2012-12-31 13:19:00 2012-12-31 13:29:00 c2002. 782.42165 col 2597502nacdaccd 0.0
1 2597502 10067444736 2012-12-31 13:29:00 2012-12-31 16:05:00 c2002. 782.42165 col 2597502nacdaccd 0.0
2 2511808 10061359948 2012-12-31 16:09:00 2012-12-31 16:09:00 2007. NaN dth 2511808ncbocdjccd 0.0
3 2802835 10066685651 2012-12-31 12:08:00 2013-01-02 11:06:00 p2012. 782.42166 cen 2802835cacdaccd 1.0
4 2412543 10056618860 2012-12-31 15:52:00 2013-01-02 11:11:00 p2007. 782.42166 cen 2412543cacdaccd 1.0
In [13]:
dur5 = dur.loc[dur.duration<=2000,]
print(dur5.shape)
(130003, 9)
In [14]:
dur5.duration.max()
Out[14]:
1286.0
In [15]:
fig, ax = plt.subplots(figsize=(16,6)) 

sns.distplot(dur5.duration, ax = ax)
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x19492ea4088>
In [16]:
dur5['month_out'] = dur5.cout.dt.month
dur5['month_in'] = dur5.cin.dt.month
dur5['hh_out'] = dur5.cout.dt.hour
dur5['hh_in'] = dur5.cin.dt.hour
dur5['dow_out'] = dur5.cout.dt.dayofweek
dur5['dow_in'] = dur5.cin.dt.dayofweek
dur5.head()
Out[16]:
bibNumber barcode cout cin PublicationYear deweyClass ItemLocation tycobib duration month_out month_in hh_out hh_in dow_out dow_in
0 2597502 10067444736 2012-12-31 13:19:00 2012-12-31 13:29:00 c2002. 782.42165 col 2597502nacdaccd 0.0 12 12 13 13 0 0
1 2597502 10067444736 2012-12-31 13:29:00 2012-12-31 16:05:00 c2002. 782.42165 col 2597502nacdaccd 0.0 12 12 13 16 0 0
2 2511808 10061359948 2012-12-31 16:09:00 2012-12-31 16:09:00 2007. NaN dth 2511808ncbocdjccd 0.0 12 12 16 16 0 0
3 2802835 10066685651 2012-12-31 12:08:00 2013-01-02 11:06:00 p2012. 782.42166 cen 2802835cacdaccd 1.0 12 1 12 11 0 2
4 2412543 10056618860 2012-12-31 15:52:00 2013-01-02 11:11:00 p2007. 782.42166 cen 2412543cacdaccd 1.0 12 1 15 11 0 2
In [17]:
dur5['dew'] = dur5.deweyClass.fillna(0).astype('int')
dur5.head()
Out[17]:
bibNumber barcode cout cin PublicationYear deweyClass ItemLocation tycobib duration month_out month_in hh_out hh_in dow_out dow_in dew
0 2597502 10067444736 2012-12-31 13:19:00 2012-12-31 13:29:00 c2002. 782.42165 col 2597502nacdaccd 0.0 12 12 13 13 0 0 782
1 2597502 10067444736 2012-12-31 13:29:00 2012-12-31 16:05:00 c2002. 782.42165 col 2597502nacdaccd 0.0 12 12 13 16 0 0 782
2 2511808 10061359948 2012-12-31 16:09:00 2012-12-31 16:09:00 2007. NaN dth 2511808ncbocdjccd 0.0 12 12 16 16 0 0 0
3 2802835 10066685651 2012-12-31 12:08:00 2013-01-02 11:06:00 p2012. 782.42166 cen 2802835cacdaccd 1.0 12 1 12 11 0 2 782
4 2412543 10056618860 2012-12-31 15:52:00 2013-01-02 11:11:00 p2007. 782.42166 cen 2412543cacdaccd 1.0 12 1 15 11 0 2 782
In [18]:
dur5.cout.min()

#dur5[dur5.cout.dt.year!=2012]
Out[18]:
Timestamp('2012-12-31 10:12:00')
In [19]:
dur5['dd_from_cout'] = (dur5.cout - dur5.cout.min()).astype('timedelta64[D]')
dur5['dd_from_cin'] = (dur5.cin - dur5.cout.min()).astype('timedelta64[D]')
dur5.head(10)
Out[19]:
bibNumber barcode cout cin PublicationYear deweyClass ItemLocation tycobib duration month_out month_in hh_out hh_in dow_out dow_in dew dd_from_cout dd_from_cin
0 2597502 10067444736 2012-12-31 13:19:00 2012-12-31 13:29:00 c2002. 782.42165 col 2597502nacdaccd 0.0 12 12 13 13 0 0 782 0.0 0.0
1 2597502 10067444736 2012-12-31 13:29:00 2012-12-31 16:05:00 c2002. 782.42165 col 2597502nacdaccd 0.0 12 12 13 16 0 0 782 0.0 0.0
2 2511808 10061359948 2012-12-31 16:09:00 2012-12-31 16:09:00 2007. NaN dth 2511808ncbocdjccd 0.0 12 12 16 16 0 0 0 0.0 0.0
3 2802835 10066685651 2012-12-31 12:08:00 2013-01-02 11:06:00 p2012. 782.42166 cen 2802835cacdaccd 1.0 12 1 12 11 0 2 782 0.0 2.0
4 2412543 10056618860 2012-12-31 15:52:00 2013-01-02 11:11:00 p2007. 782.42166 cen 2412543cacdaccd 1.0 12 1 15 11 0 2 782 0.0 2.0
5 2104088 10060644258 2012-12-31 12:18:00 2013-01-02 11:17:00 [2000] NaN cen 2104088cadvdacdvd 1.0 12 1 12 11 0 2 0 0.0 2.0
6 2805686 10077128204 2012-12-31 15:06:00 2013-01-02 12:51:00 c2010. NaN hip 2805686nadvdacdvd 1.0 12 1 15 12 0 2 0 0.0 2.0
7 2574375 10063041668 2012-12-31 17:37:00 2013-01-02 13:19:00 c2008. 791.43750 cap 2574375nadvdnfacdvd 1.0 12 1 17 13 0 2 791 0.0 2.0
8 2845574 10077886033 2012-12-31 16:49:00 2013-01-02 15:32:00 [2012], c2010. NaN cen 2845574ccdvdacdvd 1.0 12 1 16 15 0 2 0 0.0 2.0
9 2461515 10059927268 2013-01-02 15:35:00 2013-01-02 16:01:00 2007. NaN cap 2461515ncficjcbk 0.0 1 1 15 16 2 2 0 2.0 2.0
In [20]:
# count null dewey
dur5.loc[dur5.deweyClass.isnull(),].shape
Out[20]:
(75529, 18)
In [21]:
print(dur5.dd_from_cin.max())
print(dur5.duration.max())
print(np.log(dur5.duration.max()))
1709.0
1286.0
7.1592919047975645
In [22]:
print(dur5.dew.nunique())
print(dur5.deweyClass.nunique())
print(dur5.dd_from_cin.max())
316
1331
1709.0
In [23]:
# sort by location and create id_key
#dur5 = dur5.sort_values(by=['ItemLocation'])

# generate id variable 
#dur5.insert(0, 'branch_order', range(0, len(dur5)))

# 
#dur5.head(10)
In [24]:
# sort by location and create id_key
#dur5 = dur5.sort_values(by=['bibNumber'])

# generate id variable 
#dur5.insert(0, 'bib_order', range(0, len(dur5)))

# 
#dur5.head(10)
In [25]:
#dur5 = dur5.sort_values(by = ['cout'])

# generate id variable 
#dur5.insert(0, 'date_order', range(0, len(dur5)))

#dur5.head()
In [26]:
# check if some of the fields are unique 
dur5['ranker'] = dur5.barcode.rank(method='dense')
print(dur5.ranker.max())
12601.0
In [27]:
dur5.loc[dur5.barcode==10061216916,]
Out[27]:
bibNumber barcode cout cin PublicationYear deweyClass ItemLocation tycobib duration month_out month_in hh_out hh_in dow_out dow_in dew dd_from_cout dd_from_cin ranker
854 2502889 10061216916 2012-12-31 10:12:00 2013-01-19 16:18:00 [2008] NaN net 2502889ncdvdjcdvd 19.0 12 1 10 16 0 5 0 0.0 19.0 2642.0
4160 2502889 10061216916 2013-02-11 11:24:00 2013-02-21 15:29:00 [2008] NaN net 2502889ncdvdjcdvd 10.0 2 2 11 15 0 3 0 42.0 52.0 2642.0
6103 2502889 10061216916 2013-03-04 11:18:00 2013-03-11 10:43:00 [2008] NaN net 2502889ncdvdjcdvd 6.0 3 3 11 10 0 0 0 63.0 70.0 2642.0
8721 2502889 10061216916 2013-03-21 13:15:00 2013-04-03 12:34:00 [2008] NaN net 2502889ncdvdjcdvd 12.0 3 4 13 12 3 2 0 80.0 93.0 2642.0
12748 2502889 10061216916 2013-05-02 16:56:00 2013-05-09 17:45:00 [2008] NaN net 2502889ncdvdjcdvd 7.0 5 5 16 17 3 3 0 122.0 129.0 2642.0
15504 2502889 10061216916 2013-05-19 16:42:00 2013-06-05 18:14:00 [2008] NaN net 2502889ncdvdjcdvd 17.0 5 6 16 18 6 2 0 139.0 156.0 2642.0
17452 2502889 10061216916 2013-06-20 10:22:00 2013-06-23 15:20:00 [2008] NaN net 2502889ncdvdjcdvd 3.0 6 6 10 15 3 6 0 171.0 174.0 2642.0
33615 2502889 10061216916 2013-11-08 13:08:00 2013-11-20 12:06:00 [2008] NaN net 2502889ncdvdjcdvd 11.0 11 11 13 12 4 2 0 312.0 324.0 2642.0
44909 2502889 10061216916 2014-02-18 18:11:00 2014-03-12 17:18:00 [2008] NaN net 2502889ncdvdjcdvd 21.0 2 3 18 17 1 2 0 414.0 436.0 2642.0
79565 2502889 10061216916 2015-02-23 11:18:00 2015-03-04 18:03:00 [2008] NaN net 2502889ncdvdjcdvd 9.0 2 3 11 18 0 2 0 784.0 793.0 2642.0
99971 2502889 10061216916 2015-10-20 15:47:00 2015-11-01 15:41:00 [2008] NaN net 2502889ncdvdjcdvd 11.0 10 11 15 15 1 6 0 1023.0 1035.0 2642.0
103689 2502889 10061216916 2015-11-28 12:42:00 2015-12-19 12:24:00 [2008] NaN net 2502889ncdvdjcdvd 20.0 11 12 12 12 5 5 0 1062.0 1083.0 2642.0
116152 2502889 10061216916 2016-05-20 16:53:00 2016-06-06 19:22:00 [2008] NaN net 2502889ncdvdjcdvd 17.0 5 6 16 19 4 0 0 1236.0 1253.0 2642.0
121541 2502889 10061216916 2016-08-21 14:40:00 2016-08-28 13:45:00 [2008] NaN net 2502889ncdvdjcdvd 6.0 8 8 14 13 6 6 0 1329.0 1336.0 2642.0
In [28]:
# check if ranker and barcode have identical number of unqiue 
print(dur5.ranker.nunique())
print(dur5.barcode.nunique())
print(dur5.ranker.max())
12601
12601
12601.0
In [29]:
# 
dur5.ItemLocation.value_counts()
Out[29]:
cen    44777
net     8716
swt     7801
lcy     5458
bea     4374
bal     4303
dth     4258
nga     4218
gwd     4146
cap     4131
bro     4073
wts     4035
mag     3663
glk     2900
hip     2862
col     2624
rbe     2507
fre     2309
qna     1976
wal     1968
uni     1951
mgm     1706
spa     1574
nhy     1065
dlr      933
mon      701
idc      638
mob      336
Name: ItemLocation, dtype: int64
In [30]:
dur5.sort_values(by='ItemLocation', inplace=True)

dur5.tail(10)
Out[30]:
bibNumber barcode cout cin PublicationYear deweyClass ItemLocation tycobib duration month_out month_in hh_out hh_in dow_out dow_in dew dd_from_cout dd_from_cin ranker
73308 2973288 10082039123 2014-12-04 17:37:00 2014-12-26 14:17:00 2014 NaN wts 2973288ncnewjcbk 21.0 12 12 17 14 3 4 0 703.0 725.0 9654.0
47734 2569833 10063773633 2014-03-25 18:53:00 2014-04-09 11:41:00 2008. 641.59550 wts 2569833ncholjcbk 14.0 3 4 18 11 1 2 641 449.0 464.0 3179.0
47743 2159351 10045019238 2014-03-25 17:03:00 2014-04-09 12:52:00 2002. 398.20938 wts 2159351ncfftjcbk 14.0 3 4 17 12 1 2 398 449.0 464.0 832.0
47744 1331332 10041059410 2014-03-25 17:03:00 2014-04-09 12:52:00 c1993. 398.20000 wts 1331332ncfftjcbk 14.0 3 4 17 12 1 2 398 449.0 464.0 647.0
47746 2942023 10080999567 2014-04-03 13:05:00 2014-04-09 12:56:00 1994, c1993. NaN wts 2942023naficacbk 5.0 4 4 13 12 3 2 0 458.0 464.0 9080.0
12230 2830463 10077164944 2013-05-03 11:18:00 2013-05-05 14:14:00 c2011. 381.10974 wts 2830463nanfacbk 2.0 5 5 11 14 4 6 381 123.0 125.0 7326.0
47752 2692452 10072886434 2014-03-17 14:26:00 2014-04-09 13:36:00 c1992. 362.82920 wts 2692452nanfacbk 22.0 3 4 14 13 0 2 362 441.0 464.0 5659.0
124420 3004022 10082905018 2016-10-01 13:04:00 2016-10-15 12:38:00 2014. NaN wts 3004022ncnewjcbk 13.0 10 10 13 12 5 5 0 1370.0 1384.0 10194.0
30793 2443899 10059478064 2013-10-15 13:37:00 2013-10-23 13:58:00 c2007. 641.56360 wts 2443899nanfacbk 8.0 10 10 13 13 1 2 641 288.0 296.0 2345.0
103203 3122941 10085135464 2015-11-29 14:58:00 2015-12-12 17:29:00 [2014] NaN wts 3122941ncnewjcbk 13.0 11 12 14 17 6 5 0 1063.0 1076.0 10961.0
In [31]:
# generate numeric category for coloring later 
dur5['col'] = dur5.ItemLocation.astype('category')
dur5['col_code'] = dur5.col.cat.codes.astype('int64')
dur5.drop('col', axis=1, inplace=True)
print(dur5.col_code.max())
dur5.head()
27
Out[31]:
bibNumber barcode cout cin PublicationYear deweyClass ItemLocation tycobib duration month_out month_in hh_out hh_in dow_out dow_in dew dd_from_cout dd_from_cin ranker col_code
49418 2767284 10076015741 2014-04-15 12:08:00 2014-04-25 15:44:00 c2012. NaN bal 2767284nynewacbk 10.0 4 4 12 15 1 4 0 470.0 480.0 6680.0 0
80113 2499454 10061163597 2015-02-14 11:19:00 2015-03-10 15:06:00 2008. NaN bal 2499454ncrdrjcbk 24.0 2 3 11 15 5 1 0 775.0 799.0 2629.0 0
43544 2843388 10076355485 2014-02-08 15:56:00 2014-02-28 12:53:00 2012. NaN bal 2843388naficacbk 19.0 2 2 15 12 5 4 0 404.0 424.0 6866.0 0
94599 2351245 10081178864 2015-08-12 10:50:00 2015-08-29 16:18:00 2006. 398.20972 bal 2351245ncnfjcbk 17.0 8 8 10 16 2 5 398 954.0 971.0 9184.0 0
57283 2838929 10076727410 2014-06-19 14:57:00 2014-07-13 15:20:00 [2012] NaN bal 2838929nadvdacdvd 24.0 6 7 14 15 3 6 0 535.0 559.0 7084.0 0
In [32]:
# get borders for branches 
dur5.loc[dur5.ItemLocation=='cen']
Out[32]:
bibNumber barcode cout cin PublicationYear deweyClass ItemLocation tycobib duration month_out month_in hh_out hh_in dow_out dow_in dew dd_from_cout dd_from_cin ranker col_code
41135 2940335 10080825226 2014-01-23 11:53:00 2014-02-04 13:36:00 c2013. 919.693040 cen 2940335canfacbk 12.0 1 2 11 13 3 1 919 388.0 400.0 9002.0 4
72686 2940602 10080826901 2014-12-12 15:52:00 2014-12-17 17:58:00 p2013. 782.421649 cen 2940602cacdaccd 5.0 12 12 15 17 4 2 782 711.0 716.0 9004.0 4
105195 2012888 10059609833 2016-01-06 17:28:00 2016-01-08 16:56:00 p1998. 782.421650 cen 2012888cacdaccd 1.0 1 1 17 16 2 4 782 1101.0 1103.0 2368.0 4
112195 3086048 10086385274 2016-03-30 17:43:00 2016-04-09 16:09:00 [2015] 741.595200 cen 3086048cycomicacbk 9.0 3 4 17 16 2 5 741 1185.0 1195.0 11433.0 4
72520 3004529 10083292580 2014-12-13 16:14:00 2014-12-16 10:38:00 [2013] 741.597300 cen 3004529cacomicacbk 2.0 12 12 16 10 5 1 741 712.0 715.0 10343.0 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
24538 2663532 10068207686 2013-08-14 18:21:00 2013-08-24 14:21:00 p2010. 782.421660 cen 2663532cacdaccd 9.0 8 8 18 14 2 5 782 226.0 236.0 4274.0 4
83765 1274903 10041377366 2015-04-04 14:30:00 2015-04-23 13:11:00 c1991. 979.500490 cen 1274903ccnfjcbk 18.0 4 4 14 13 5 3 979 824.0 843.0 667.0 4
80855 2459616 10059783505 2015-03-14 12:27:00 2015-03-18 15:58:00 p2004. 782.421660 cen 2459616cacdaccd 4.0 3 3 12 15 5 2 782 803.0 807.0 2382.0 4
119022 2111939 10043035632 2016-06-28 17:06:00 2016-07-19 16:54:00 2002. 788.921650 cen 2111939canfacbk 20.0 6 7 17 16 1 1 788 1275.0 1296.0 733.0 4
119012 1798498 10032506114 2016-06-28 18:08:00 2016-07-19 15:58:00 c1997. NaN cen 1798498ccpicjcbk 20.0 6 7 18 15 1 1 0 1275.0 1296.0 247.0 4

44777 rows × 20 columns

In [33]:
dur5['ordered_idx'] = dur5.groupby(['ItemLocation', 'barcode']).ngroup()
print(dur5.ordered_idx.nunique())
print(dur5.ordered_idx.max())
12603
12602
In [43]:
dur5.groupby(['ItemLocation', 'col_code']).size().sort_values(ascending=False)
Out[43]:
ItemLocation  col_code
cen           4           44777
net           18           8716
swt           24           7801
lcy           13           5458
bea           1            4374
bal           0            4303
dth           7            4258
nga           19           4218
gwd           10           4146
cap           3            4131
bro           2            4073
wts           27           4035
mag           14           3663
glk           9            2900
hip           11           2862
col           5            2624
rbe           22           2507
fre           8            2309
qna           21           1976
wal           26           1968
uni           25           1951
mgm           15           1706
spa           23           1574
nhy           20           1065
dlr           6             933
mon           17            701
idc           12            638
mob           16            336
dtype: int64
In [35]:
dur5.to_csv('prep_ink.csv')
In [36]:
dur5.iloc[:5000,].to_csv('prep_ink2.csv')