Python pandas module: to_pickle() code examples
We extracted the following 21 code examples from open-source Python projects to illustrate how to use pandas.to_pickle().
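Before the project examples, here is a minimal round-trip sketch (the DataFrame and file name are illustrative, not taken from any project below): pd.to_pickle(obj, path) serializes a picklable Python object to disk, and pd.read_pickle(path) restores it.

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
pd.to_pickle(df, 'example.pkl')            # serialize to disk
restored = pd.read_pickle('example.pkl')   # load it back
assert df.equals(restored)

Note that pd.to_pickle accepts arbitrary picklable objects, not only DataFrames, which is why several of the examples below pickle dicts, numpy arrays, and fitted models.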
def compute_cell_smushing(self):
    """Within each plate, find a 2d embedding of all cells"""
    grouped = self.genes.groupby(self.cell_Metadata[self.SAMPLE_MAPPING])
    if os.path.exists(self.cell_smushed_cache_file):
        smusheds = pd.read_pickle(self.cell_smushed_cache_file)
        # if nothing is missing, return the cached version
        if not set(grouped.groups) - set(smusheds):
            return smusheds
    else:
        smusheds = {}
    for plate_name, genes_subset in grouped:
        if plate_name not in smusheds:
            cell_smusher = TSNE(metric='cosine', random_state=0)
            cell_smushed = pd.DataFrame(
                cell_smusher.fit_transform(genes_subset),
                index=genes_subset.index)
            smusheds[plate_name] = cell_smushed
    pd.to_pickle(smusheds, self.cell_smushed_cache_file)
    return smusheds
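The caching pattern above (return the pickled cache when it already covers every group, otherwise fill in only the missing entries and re-pickle) generalizes beyond this project. A minimal standalone sketch of that pattern, with illustrative function and argument names:

import os
import pandas as pd

def cached_compute(cache_file, keys, compute_one):
    """Return {key: result}, reusing the pickled cache when it covers all keys."""
    cached = pd.read_pickle(cache_file) if os.path.exists(cache_file) else {}
    if not set(keys) - set(cached):
        return cached                       # nothing missing, use the cache
    for key in keys:
        if key not in cached:
            cached[key] = compute_one(key)  # compute only the missing entries
    pd.to_pickle(cached, cache_file)        # refresh the cache on disk
    return cached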
def train(features):
    X, Y = ordered_dict_to_x_y(features)
    pd.DataFrame(X).to_csv('features.csv')
    clf = get_classification()
    clf.fit(X, Y)
    pd.to_pickle(clf, 'classification.pkl')
    return clf
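Since pickle preserves arbitrary Python objects, the fitted classifier saved above can be restored later with pd.read_pickle. A hypothetical follow-up, where X_new is a placeholder for fresh feature rows:

clf = pd.read_pickle('classification.pkl')  # restore the fitted model
predictions = clf.predict(X_new)            # X_new: placeholder, not defined here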
def save_dataset(dataset, output_path):
    """ Save the whole dataset as a pickle file
    :param dataset: pandas DataFrame
    :param output_path: path and file name of the output
    """
    logging.info('Saving dataset to pickle file: ' + output_path)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    pandas.to_pickle(dataset, output_path)
def round_trip_pickle(self, obj, path=None):
    if path is None:
        path = u('__%s__.pickle' % rands(10))
    with ensure_clean(path) as path:
        pd.to_pickle(obj, path)
        return pd.read_pickle(path)
# https://docs.python.org/3/library/unittest.html#deprecated-aliases
def optimizer(strategyclass, portfolioclass, Feed, params_generator, pkl_name=None):
    log = {}
    if pkl_name is None:
        pkl_name = 'optimizer_log'
    pkl_path = os.path.join(sys.path[0], '%s.pkl' % pkl_name)
    pd.to_pickle(log, pkl_path)
    while True:
        try:
            p_list = next(params_generator)
        except StopIteration:
            break
        else:
            backup = copy.deepcopy(Feed)
            data = backup
            strategy = strategyclass(data, p_list)
            portfolio = portfolioclass(data)
            go = OnePiece(data, strategy, portfolio)

            def combine():
                go.sunny()
                print(p_list)
                log = pd.read_pickle(pkl_path)
                log[p_list] = go.get_all_holdings().iat[-1, -1]
                pd.to_pickle(log, pkl_path)

            p = multiprocessing.Process(target=combine)
            p.daemon = True
            p.start()
            p.join()
def tushare_clean(csv_path, override=True, pickle_name=None):
    """
    1. save to local csv
    2. save to local pickle
    """
    def clean(df):
        df.reset_index(drop=True, inplace=True)
        df['date'] = pd.DatetimeIndex(df['date'])
        df.set_index('date', inplace=True)
        return df

    walk_list = next(os.walk(csv_path))
    pickle_dict = {}
    for i in walk_list[2]:
        if 'csv' in i:
            df = pd.read_csv(os.path.join(csv_path, i),
                             parse_dates=True, index_col=0)
            cleaned_df = clean(df)
            # override CSV
            if override:
                cleaned_df.to_csv(os.path.join(csv_path, i))
            # create pickle
            if isinstance(pickle_name, str):
                symbol = i.replace('.csv', '')
                pickle_dict[symbol] = cleaned_df
    # Save to pickle
    if isinstance(pickle_name, str):
        pd.to_pickle(pickle_dict, os.path.join(csv_path, '%s.pkl' % pickle_name))
def svd(train, test, dims=20, it=15, file_name='tf_idf', path='data/'):
    svd = TruncatedSVD(n_iter=it, random_state=1123, n_components=dims)
    svd.fit(train)
    pd.to_pickle(svd.transform(train), path + 'train_svd_' + str(dims) + '_' + file_name + '.pkl')
    pd.to_pickle(svd.transform(test), path + 'test_svd_' + str(dims) + '_' + file_name + '.pkl')
    return 'Success'
def toTsne(train, n_component=2, path='data/'):
    # NOTE: `test` and `file_name` are globals defined elsewhere in the original script
    tsne = TSNE(n_components=n_component, n_jobs=-1)
    lentrain = train.shape[0]
    X = np.vstack([train, test])
    tsne.fit(X)
    res = tsne.embedding_
    # print(res)
    pd.to_pickle(res[:lentrain], path + 'train_svd_20_tsne_' + str(n_component) + '_' + file_name + '.pkl')
    pd.to_pickle(res[lentrain:], path + 'test_svd_20_tsne_' + str(n_component) + '_' + file_name + '.pkl')
    return 'Success'
def svd(train, dims=6, path='data/'):
    # NOTE: `test` and `file_name` are globals defined elsewhere in the original script
    svd = NMF(random_state=1123, n_components=dims)
    svd.fit(train)
    # print(svd.transform(train).shape)
    pd.to_pickle(svd.transform(train), path + 'train_NMF_' + str(dims) + '_' + file_name + '.pkl')
    pd.to_pickle(svd.transform(test), path + 'test_NMF_' + str(dims) + '_' + file_name + '.pkl')
    return 'Success'
def save_dict(self, outpath='./data/dictionary/'):
    pd.to_pickle(self.word_index, outpath + 'word_index.pkl')
    pd.to_pickle(self.index_word, outpath + 'index_word.pkl')
    if self.mode != 'word':
        pd.to_pickle(self.char_index, outpath + 'char_index.pkl')
        pd.to_pickle(self.index_char, outpath + 'index_char.pkl')
def make_mf_lsvc_classification(X, y, clf, X_test, n_folds=5, seed=1024, nb_epoch=50, max_features=0.75, name='xgb', path=''):
    '''
    Fit Metafeature by @clf and get prediction for test. Assumed that @clf -- classifier
    '''
    n = X.shape[0]
    print(clf)
    for epoch in range(nb_epoch):
        print("Start epoch:", epoch)
        mf_tr = np.zeros(X.shape[0])
        mf_te = np.zeros(X_test.shape[0])
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X, y)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]
            y_tr = y[ind_tr]
            y_te = y[ind_te]
            clf.fit(X_tr, y_tr)
            mf_tr[ind_te] += clf.predict_proba(X_te).ravel()
            score = accuracy_score(y_te, clf.predict(X_te).ravel())
            del X_tr
            del X_te
            mf_te += clf.predict_proba(X_test).ravel()
            print('\tpred[{}] score:{}'.format(epoch, score))
        mf_te /= n_folds
        pd.to_pickle(mf_tr.reshape(-1, 1), path + 'X_mf_%s_%s_random.pkl' % (name, epoch))
        pd.to_pickle(mf_te.reshape(-1, 1), path + 'X_t_mf_%s_%s_random.pkl' % (name, epoch))
def make_mf_regression(X, y, clf, X_test, n_folds=5, seed=1024, nb_epoch=50, name='xgb', path=''):
    '''Fit Metafeature by @clf and get prediction for test. Assumed that @clf -- regressor'''
    print(clf)
    for epoch in range(nb_epoch):
        print("Start epoch:", epoch)
        mf_tr = np.zeros(X.shape[0])
        mf_te = np.zeros(X_test.shape[0])
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X, y)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]
            y_tr = y[ind_tr]
            y_te = y[ind_te]
            clf.fit(X_tr, y_tr)
            mf_tr[ind_te] += clf.predict(X_te)
            del X_tr
            del X_te
            # predict the test set in batches to limit memory use
            l = 600000
            y_pred = []
            for batch in range(4):
                X_tmp = X_test[l*batch:l*(batch+1)]
                y_pred.append(clf.predict(X_tmp))
            y_pred = np.concatenate(y_pred)
            mf_te += y_pred
            score = log_loss(y_te, mf_tr[ind_te])
            print('\tpred[{}] score:{}'.format(epoch, score))
        mf_te /= n_folds
        pd.to_pickle(mf_tr, path + 'X_mf_%s_%s_random.pkl' % (name, epoch))
        pd.to_pickle(mf_te, path + 'X_t_mf_%s_%s_random.pkl' % (name, epoch))
def expand_file_data(infile, outfile, pitcher):
    infile_data = pd.read_pickle(infile)
    outfile_data = expand_mlb_data(infile_data=infile_data, pitcher=pitcher)
    pd.to_pickle(outfile_data, outfile)
    return outfile_data
def split_data(infile, train, test, attrfile, na_strategy, trainpct, split_randomly):
    expanded_data = strip_and_process_na(pd.read_pickle(infile), na_strategy)
    train_example_count = int(len(expanded_data.index) * trainpct / 100.0)
    if split_randomly:
        train_indices = np.random.choice(expanded_data.index, size=train_example_count, replace=False)
    else:
        train_indices = expanded_data.sort_values("Date").index[:train_example_count]
    train_data = expanded_data.loc[train_indices]
    test_data = expanded_data.drop(train_indices)
    pd.to_pickle(train_data, train)
    pd.to_pickle(test_data, test)
def serialize(cls, formatted_data, fh):
    # compat: if pandas is old, to_pickle does not accept file handles
    if LooseVersion(pd.__version__) <= LooseVersion('0.20.3'):
        fh.close()
        fh = fh.name
    return pd.to_pickle(formatted_data, fh)
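On pandas releases newer than the 0.20.3 guard above, to_pickle also accepts open binary file objects and in-memory buffers, so the workaround becomes unnecessary. A small sketch assuming such a recent version:

import io
import pandas as pd

buf = io.BytesIO()
pd.to_pickle({'key': 'value'}, buf)  # write to an in-memory buffer
buf.seek(0)
print(pd.read_pickle(buf))           # {'key': 'value'}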
def test_round_trip_current(self):
    try:
        import cPickle as c_pickle

        def c_pickler(obj, path):
            with open(path, 'wb') as fh:
                c_pickle.dump(obj, fh, protocol=-1)

        def c_unpickler(path):
            with open(path, 'rb') as fh:
                fh.seek(0)
                return c_pickle.load(fh)
    except ImportError:
        c_pickler = None
        c_unpickler = None

    import pickle as python_pickle

    def python_pickler(obj, path):
        with open(path, 'wb') as fh:
            python_pickle.dump(obj, fh, protocol=-1)

    def python_unpickler(path):
        with open(path, 'rb') as fh:
            fh.seek(0)
            return python_pickle.load(fh)

    for typ, dv in self.data.items():
        for dt, expected in dv.items():
            for writer in [pd.to_pickle, c_pickler, python_pickler]:
                if writer is None:
                    continue
                with tm.ensure_clean(self.path) as path:
                    # test writing with each pickler
                    writer(expected, path)
                    # test reading with each unpickler
                    result = pd.read_pickle(path)
                    self.compare_element(result, expected, typ)
                    if c_unpickler is not None:
                        result = c_unpickler(path)
                        self.compare_element(result, expected, typ)
                    result = python_unpickler(path)
                    self.compare_element(result, expected, typ)
def make_mf_classification(X, y, clf, X_test, n_folds=5, seed=1024, nb_epoch=50, name='xgb', path=''):
    '''Fit Metafeature by @clf and get prediction for test. Assumed that @clf -- classifier'''
    print(clf)
    for epoch in range(nb_epoch):
        print("Start epoch:", epoch)
        mf_tr = np.zeros((X.shape[0], len(np.unique(y))))
        mf_te = np.zeros((X_test.shape[0], len(np.unique(y))))
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X, y)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]
            y_tr = y[ind_tr]
            y_te = y[ind_te]
            if ssp.issparse(X):
                clf.fit(X_tr.tocsc(), y_tr)
                mf_tr[ind_te] += clf.predict_proba(X_te.tocsc())
            else:
                clf.fit(X_tr, y_tr)
                mf_tr[ind_te] += clf.predict_proba(X_te)
            del X_tr
            del X_te
            # predict the test set in batches to limit memory use
            l = 600000
            y_pred = []
            for batch in range(4):
                if batch != 3:
                    X_tmp = X_test[l*batch:l*(batch+1)]
                else:
                    X_tmp = X_test[l*batch:]
                if ssp.issparse(X):
                    y_pred.append(clf.predict_proba(X_tmp.tocsc()))
                else:
                    y_pred.append(clf.predict_proba(X_tmp))
            y_pred = np.vstack(y_pred)
            mf_te += y_pred
            score = log_loss(y_te, mf_tr[ind_te])
            print('\tpred[{}] score:{}'.format(epoch, score))
        mf_te /= n_folds
        pd.to_pickle(mf_tr, path + 'X_mf_%s_%s_random.pkl' % (name, epoch))
        pd.to_pickle(mf_te, path + 'X_t_mf_%s_%s_random.pkl' % (name, epoch))
def dump_nba_data(outfile, start_date=None, end_date=None, max_count=None, use_random=False):
    """
    Dump NBA statistical data to a file.
    :param str outfile: name of file to become pickled pandas datafile
    :param str start_date: don't include games from before this date when dumping data
    :param str end_date: don't include games from after this date when dumping data
    :param int max_count: maximum # of rows to dump
    :param bool use_random: whether to select rows at random (if False, choose most recent)
    :return:
    """
    if start_date:
        start_date = parser.parse(start_date)
    else:
        start_date = datetime.datetime(2010, 10, 1)
    if end_date:
        end_date = parser.parse(end_date)
    else:
        end_date = datetime.datetime.today()
    print('Dump NBA data for %s to %s' % (start_date, end_date))
    print('loading data...')
    all_game_rows = load_all_game_data()
    # Filter by date
    if start_date is not None:
        all_game_rows = all_game_rows[all_game_rows['date'] > start_date]
    if end_date is not None:
        all_game_rows = all_game_rows[all_game_rows['date'] < end_date]
    # Sample filtered data
    if max_count and max_count < len(all_game_rows):
        print('sampling %d rows...' % max_count)
        if use_random:
            # We seed to 0 when we call this from CLI to make sure that random splits are replicable.
            random.seed(0)
            kept_indices = random.sample(list(all_game_rows.index), max_count)
            selected = all_game_rows.loc[kept_indices]
        else:
            all_game_rows = all_game_rows.sort_values("date")
            selected = all_game_rows.tail(max_count)
    else:
        selected = all_game_rows
    print('saving...')
    pandas.to_pickle(selected, outfile)
    print('Done!')
    return selected
def dump_mlb_data(outfile, start_date=None, end_date=None, max_count=None, use_random=False, datatype='batting'):
    """
    Dump MLB statistical data to a file.
    :param str outfile: name of file to become pickled pandas datafile
    :param str start_date: don't include games from before this date when dumping data
    :param str end_date: don't include games from after this date when dumping data
    :param int max_count: maximum # of rows to dump
    :param bool use_random: whether to select rows at random (if False, choose most recent)
    :return:
    """
    print('Dump MLB data for', datatype)
    print('loading data...')
    all_bsbr_logs = load_gamelogs(datatype=datatype)
    unindexed_dfs = []
    print('reindexing data...')
    pbar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), ' ', progressbar.Bar(), progressbar.ETA()])
    for player_id, dataframe in pbar(all_bsbr_logs.items()):
        uidf = dataframe.reset_index()
        # Add player ID as a column to the dataframe for future joining purposes!
        uidf['player_id'] = pandas.Series(data=player_id, index=uidf.index)
        unindexed_dfs.append(uidf)
    all_game_rows = pandas.concat(unindexed_dfs, ignore_index=True)
    # Filter by date
    if start_date is not None:
        all_game_rows = all_game_rows[all_game_rows['Date'] > start_date]
    if end_date is not None:
        all_game_rows = all_game_rows[all_game_rows['Date'] < end_date]
    # Don't use relief pitchers in our dataset
    if datatype == 'pitching':
        print('restricting to starting pitchers only...')
        all_game_rows = all_game_rows[all_game_rows['player_id'].apply(brefid_is_starting_pitcher)]
    # Sample filtered data
    if max_count and max_count < len(all_game_rows):
        print('sampling %d rows...' % max_count)
        if use_random:
            kept_indices = random.sample(list(all_game_rows.index), max_count)
            selected = all_game_rows.loc[kept_indices]
        else:
            all_game_rows = all_game_rows.sort_values("Date")
            selected = all_game_rows.tail(max_count)
    else:
        selected = all_game_rows
    print('saving...')
    pandas.to_pickle(selected, outfile)
    print('Done!')
    return selected