Python: pandas notes
Published: 2019-06-27


 

# Converting a dict to a Series
>>> sdata = {'语文': 89, '数学': 96, '音乐': 39, '英语': 78, '化学': 88}
>>> studata = Series(sdata)
>>> studata
化学    88
数学    96
英语    78
语文    89
音乐    39
dtype: int64
>>> obj = Series(sdata, index=['物理', '数学', '化学'])
>>> obj        # there is no 物理 score, so NaN appears, which also forces the dtype to float
物理     NaN
数学    96.0
化学    88.0
dtype: float64

# Testing for missing values
>>> pd.isnull(obj)
物理     True
数学    False
化学    False
dtype: bool
>>> pd.notnull(obj)
物理    False
数学     True
化学     True
dtype: bool
>>> obj.isnull()
物理     True
数学    False
化学    False
dtype: bool

# Element-wise addition: the indexes align automatically
>>> en = Series([84, 94, 51, 81], index=['张三', '李四', '王五', '赵六'])
>>> sx = Series([94, 81, 31, 91], index=['张三', '赵六', '王五', '李四'])
>>> en + sx
张三    178
李四    185
王五     82
赵六    162
dtype: int64

# The name attribute of a Series
>>> en.name = '英语成绩'
>>> en
张三    84
李四    94
王五    51
赵六    81
Name: 英语成绩, dtype: int64
>>> en.index.name = '姓名'
>>> en
姓名
张三    84
李四    94
王五    51
赵六    81
Name: 英语成绩, dtype: int64

# The index can be reassigned in place
>>> en.index = ['zs', 'll', 'ww', 'zl']
>>> en
zs    84
ll    94
ww    51
zl    81
Name: 英语成绩, dtype: int64

############# DataFrame ##############
>>> data = {'name': ['张三', '张三', '张三', '李四', '李四', '李四'],
...         'year': [2001, 2002, 2003, 2001, 2002, 2003],
...         'weight': [54, 50, 60, 61, 63, 65]}
>>> frame = DataFrame(data)
>>> frame
  name  weight  year
0   张三      54  2001
1   张三      50  2002
2   张三      60  2003
3   李四      61  2001
4   李四      63  2002
5   李四      65  2003

# columns controls both the order and the selection of columns
>>> DataFrame(data, columns=['year', 'weight', 'name'])
   year  weight name
0  2001      54   张三
1  2002      50   张三
2  2003      60   张三
3  2001      61   李四
4  2002      63   李四
5  2003      65   李四
>>> a = DataFrame(data, columns=['year', 'weight', 'name', 'sex'],
...               index=['one', 'two', 'three', 'four', 'five', 'five'])
>>> a
       year  weight name  sex
one    2001      54   张三  NaN
two    2002      50   张三  NaN
three  2003      60   张三  NaN
four   2001      61   李四  NaN
five   2002      63   李四  NaN
five   2003      65   李四  NaN

# Selecting by a duplicated index label returns every matching row
>>> a.ix['five']
      year  weight name  sex
five  2002      63   李四  NaN
five  2003      65   李四  NaN

# DataFrame -> Series: selecting a single column
>>> info = DataFrame(data, columns=['year', 'weight', 'name', 'sex'],
...                  index=['one', 'two', 'three', 'four', 'five', 'five'])
>>> info['name']
one      张三
two      张三
three    张三
four     李四
five     李四
five     李四
Name: name, dtype: object

# Assigning a scalar to a column
>>> info['sex'] = '男'
>>> info
       year  weight name sex
one    2001      54   张三   男
two    2002      50   张三   男
three  2003      60   张三   男
four   2001      61   李四   男
five   2002      63   李四   男
five   2003      65   李四   男

# Assigning a Series fills only the matching labels; the rest become NaN
>>> val = Series(['man', 'woman', 'man'], index=['two', 'four', 'five'])
>>> info['sex'] = val
>>> info
       year  weight name    sex
one    2001      54   张三    NaN
two    2002      50   张三    man
three  2003      60   张三    NaN
four   2001      61   李四  woman
five   2002      63   李四    man
five   2003      65   李四    man

# Assigning to a column that does not exist yet creates it
>>> info['sexflag'] = info.sex == 'man'
>>> info
       year  weight name    sex sexflag
one    2001      54   张三    NaN   False
two    2002      50   张三    man    True
three  2003      60   张三    NaN   False
four   2001      61   李四  woman   False
five   2002      63   李四    man    True
five   2003      65   李四    man    True

# Deleting a column
>>> del info['sex']
>>> info
       year  weight name sexflag
one    2001      54   张三   False
two    2002      50   张三    True
three  2003      60   张三   False
four   2001      61   李四   False
five   2002      63   李四    True
five   2003      65   李四    True

# Nested dict -> DataFrame: outer keys become the columns, inner keys the rows
>>> studata = {'张三': {'语文': 91, '数学': 99, '物理': 90},
...            '李四': {'语文': 31, '数学': 65, '物理': 45}}
>>> info2 = DataFrame(studata)
>>> info2
    张三  李四
数学  99  65
物理  90  45
语文  91  31
>>> info2.T
    数学  物理  语文
张三  99  90  91
李四  65  45  31

# The index.name and columns.name attributes
>>> info.index.name = '个人信息'
>>> info.columns.name = '索引'
>>> info
索引     year  weight name sexflag
个人信息
one    2001      54   张三   False
two    2002      50   张三    True
three  2003      60   张三   False
four   2001      61   李四   False
five   2002      63   李四    True
five   2003      65   李四    True
>>> info.index
Index([u'one', u'two', u'three', u'four', u'five', u'five'], dtype='object', name=u'个人信息')
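The .ix indexer used throughout this transcript was deprecated in pandas 0.20 and later removed; a minimal sketch of the same duplicate-label lookup with .loc, assuming a recent pandas:

import pandas as pd

data = {'name': ['张三', '张三', '张三', '李四', '李四', '李四'],
        'year': [2001, 2002, 2003, 2001, 2002, 2003],
        'weight': [54, 50, 60, 61, 63, 65]}
info = pd.DataFrame(data, index=['one', 'two', 'three', 'four', 'five', 'five'])
# .loc on a duplicated label returns every matching row, just like .ix did
print(info.loc['five'])
# .loc also takes row and column labels together, replacing info.ix['one', ['name', 'year']]
print(info.loc['one', ['name', 'year']])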
# Dropping duplicate index values
>>> info.index.unique()
array(['one', 'two', 'three', 'four', 'five'], dtype=object)
# Is every index label unique?
>>> info.index.is_unique
False
# is_monotonic is True when each element is >= the one before it
>>> DataFrame(range(1, 4), index=range(1, 4)).index.is_monotonic
True
>>> info.index.is_monotonic
False
# drop removes the passed labels and returns a new Index
>>> DataFrame(range(1, 4), index=range(1, 4)).index.drop(1)
Int64Index([2, 3], dtype='int64')

# Reindexing
>>> obj = Series([33, 23], index=['a', 'b'])
>>> obj
a    33
b    23
dtype: int64
>>> obj2 = obj.reindex(['b', 'a', 'c'])
>>> obj2
b    23.0
a    33.0
c     NaN
dtype: float64
>>> obj2 = obj.reindex(['b', 'a', 'c'], fill_value=0)
>>> obj2
b    23
a    33
c     0
dtype: int64
>>> obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
>>> obj3
0      blue
2    purple
4    yellow
dtype: object
# ffill: forward fill
>>> obj3.reindex(range(6), method='ffill')
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object
# bfill: backward fill
>>> obj3.reindex(range(6), method='bfill')
0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object

>>> frame = DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'b', 'd'],
...                   columns=['Ohio', 'Texas', 'california'])
>>> frame
   Ohio  Texas  california
a     0      1           2
b     3      4           5
d     6      7           8
# Reindexing the rows
>>> frame2 = frame.reindex(['a', 'b', 'c', 'd'])
>>> frame2
   Ohio  Texas  california
a   0.0    1.0         2.0
b   3.0    4.0         5.0
c   NaN    NaN         NaN
d   6.0    7.0         8.0
# Reindexing the columns
>>> cols = ['Texas', 'Ohio', 'uknown']
>>> frame.reindex(columns=cols)
   Texas  Ohio  uknown
a      1     0     NaN
b      4     3     NaN
d      7     6     NaN
>>> frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill', columns=cols)
   Texas  Ohio  uknown
a      1     0     NaN
b      4     3     NaN
c      4     3     NaN
d      7     6     NaN
>>> data = frame.ix[['a', 'b', 'c', 'd'], cols]
>>> data
   Texas  Ohio  uknown
a    1.0   0.0     NaN
b    4.0   3.0     NaN
c    NaN   NaN     NaN
d    7.0   6.0     NaN

# Dropping rows
>>> data.drop(['c', 'b'])
   Texas  Ohio  uknown
a    1.0   0.0     NaN
d    7.0   6.0     NaN
# Dropping a column
>>> data.drop('uknown', axis=1)
   Texas  Ohio
a    1.0   0.0
b    4.0   3.0
c    NaN   NaN
d    7.0   6.0

# Filtering rows on a column condition
>>> info[info['weight'] > 60]
索引    year  weight name sexflag
个人信息
four  2001      61   李四   False
five  2002      63   李四    True
five  2003      65   李四    True
>>> info.ix['one', ['name', 'year']]
索引
name      张三
year    2001
Name: one, dtype: object

>>> data = DataFrame(np.arange(16).reshape((4, 4)),
...                  index=['Ohio', 'Colorado', 'Utah', 'NewYork'],
...                  columns=['one', 'two', 'three', 'four'])
>>> data
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
NewYork    12   13     14    15
>>> data['two']
Ohio         1
Colorado     5
Utah         9
NewYork     13
Name: two, dtype: int64
>>> data[['three', 'one']]
          three  one
Ohio          2    0
Colorado      6    4
Utah         10    8
NewYork      14   12
>>> data[:2]
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
>>> data[data['three'] > 5]
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
NewYork    12   13     14    15
>>> data < 5
            one    two  three   four
Ohio       True   True   True   True
Colorado   True  False  False  False
Utah      False  False  False  False
NewYork   False  False  False  False
>>> data[data < 5] = 0
>>> data
          one  two  three  four
Ohio        0    0      0     0
Colorado    0    5      6     7
Utah        8    9     10    11
NewYork    12   13     14    15

# Mixed row/column selection
>>> data.ix['Colorado', ['two', 'three']]
two      5
three    6
Name: Colorado, dtype: int64
>>> data.ix[['Colorado', 'Utah'], [3, 0, 1]]
          four  one  two
Colorado     7    0    5
Utah        11    8    9
>>> data.ix[:'Utah', 'two']
Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64
>>> data.ix[data.three > 5, :3]
          one  two  three
Colorado    0    5      6
Utah        8    9     10
NewYork    12   13     14

# Indexing summary:
# obj[val]            select a single column or a group of columns
# obj.ix[val]         select a single row or a group of rows
# obj.ix[:, val]      select a single column or a subset of columns
# obj.ix[val1, val2]  select rows and columns at the same time
# reindex             conform to a new index
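A self-contained sketch of the reindex/drop behaviour above on a fresh frame, runnable on its own (the state names are just the ones used above):

import numpy as np
import pandas as pd

frame = pd.DataFrame(np.arange(9).reshape(3, 3),
                     index=['a', 'b', 'd'],
                     columns=['Ohio', 'Texas', 'california'])
# method='ffill' works because the index is monotonic; row 'c' copies row 'b'
print(frame.reindex(['a', 'b', 'c', 'd'], method='ffill'))
print(frame.drop('Texas', axis=1))   # drop a column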
# Data alignment between DataFrames
>>> df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
...                 index=['good', 'bad', 'normal'])
>>> df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
...                 index=['good', 'normal', 'bad', 'supper'])
>>> df1
          b    c    d
good    0.0  1.0  2.0
bad     3.0  4.0  5.0
normal  6.0  7.0  8.0
>>> df2
          b     d     e
good    0.0   1.0   2.0
normal  3.0   4.0   5.0
bad     6.0   7.0   8.0
supper  9.0  10.0  11.0
>>> df1 + df2
          b   c     d   e
bad     9.0 NaN  12.0 NaN
good    0.0 NaN   3.0 NaN
normal  9.0 NaN  12.0 NaN
supper  NaN NaN   NaN NaN
# add with fill_value=0 treats values missing on one side as 0
>>> df1.add(df2, fill_value=0)
          b    c     d     e
bad     9.0  4.0  12.0   8.0
good    0.0  1.0   3.0   2.0
normal  9.0  7.0  12.0   5.0
supper  9.0  NaN  10.0  11.0
# reindex also accepts a fill value
>>> df1.reindex(columns=df2.columns, fill_value=0)
          b    d  e
good    0.0  2.0  0
bad     3.0  5.0  0
normal  6.0  8.0  0
# The other arithmetic methods: add (+), sub (-), div (/), mul (*)

# Operations between a DataFrame and a Series
>>> frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
...                   index=['good', 'bad', 'supper', 'uknown'])
>>> frame
          b     d     e
good    0.0   1.0   2.0
bad     3.0   4.0   5.0
supper  6.0   7.0   8.0
uknown  9.0  10.0  11.0
>>> series = frame.ix[0]
>>> series
b    0.0
d    1.0
e    2.0
Name: good, dtype: float64
# Broadcasting: the Series is matched on the columns and propagated down the rows
>>> frame - series
          b    d    e
good    0.0  0.0  0.0
bad     3.0  3.0  3.0
supper  6.0  6.0  6.0
uknown  9.0  9.0  9.0
>>> series2 = Series(range(3), index=list('bef'))
>>> series2
b    0
e    1
f    2
dtype: int64
>>> frame + series2
          b   d     e   f
good    0.0 NaN   3.0 NaN
bad     3.0 NaN   6.0 NaN
supper  6.0 NaN   9.0 NaN
uknown  9.0 NaN  12.0 NaN
# Broadcasting down the columns instead: match on the rows with axis=0
>>> series3 = frame['d']        # this definition was missing from the original transcript
>>> frame.sub(series3, axis=0)
          b    d    e
good   -1.0  0.0  1.0
bad    -1.0  0.0  1.0
supper -1.0  0.0  1.0
uknown -1.0  0.0  1.0

>>> frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),
...                   index=['good', 'bad', 'nice', 'supper'])
>>> frame
               b         d         e
good    0.428420 -0.951975  0.862226
bad    -0.666254 -0.988423  2.442255
nice    1.617591  0.377867 -1.069077
supper -1.417150  0.449853  0.685007
# Absolute value of every element
>>> np.abs(frame)
               b         d         e
good    0.428420  0.951975  0.862226
bad     0.666254  0.988423  2.442255
nice    1.617591  0.377867  1.069077
supper  1.417150  0.449853  0.685007

# apply: run a function over each column (axis=0) or each row (axis=1)
>>> f = lambda x: x.max() - x.min()
>>> frame.apply(f, axis=0)
b    3.034740
d    1.438276
e    3.511332
dtype: float64
>>> frame.apply(f, axis=1)
good      1.814201
bad       3.430677
nice      2.686668
supper    2.102157
dtype: float64
>>> def f(x):
...     return Series([x.min(), x.max()], index=['min', 'max'])
...
>>> frame.apply(f)
            b         d         e
min -1.417150 -0.988423 -1.069077
max  1.617591  0.449853  2.442255

# Formatting every element as a string
>>> format = lambda x: '%.2f' % x
>>> frame.applymap(format)
            b      d      e
good     0.43  -0.95   0.86
bad     -0.67  -0.99   2.44
nice     1.62   0.38  -1.07
supper  -1.42   0.45   0.69

############# Sorting and ranking ##############
# ascending chooses ascending vs. descending order
>>> frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
...                   columns=list('nalv'))
>>> frame
       n  a  l  v
three  0  1  2  3
one    4  5  6  7
>>> frame.sort_index()
       n  a  l  v
one    4  5  6  7
three  0  1  2  3
>>> frame.sort_index(axis=1)
       a  l  n  v
three  1  2  0  3
one    5  6  4  7
>>> frame.sort_index(axis=1, ascending=False)
       v  n  l  a
three  3  0  2  1
one    7  4  6  5
>>> obj = Series([4, 5, -3, 2])
>>> obj.order()            # sort by value
2   -3
3    2
0    4
1    5
dtype: int64

# Sort descending by column 'v'
>>> frame.sort_index(axis=0, ascending=False, by='v')
       n  a  l  v
one    4  5  6  7
three  0  1  2  3
>>> frame.sort_index(axis=0, ascending=False, by=['v', 'l'])
       n  a  l  v
one    4  5  6  7
three  0  1  2  3

# Ranking
>>> obj = Series([7, -5, 7, 4, 2, 0, 4])
>>> obj.rank(method='first')
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64
>>> obj.rank(ascending=False, method='max')
0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64
>>> DataFrame(studata).T
    数学  物理  语文
张三  99  90  91
李四  65  45  31
>>> DataFrame(studata).T.rank(axis=1, ascending=False)
     数学   物理   语文
张三  1.0  3.0  2.0
李四  1.0  2.0  3.0
>>> DataFrame(studata).T.rank(axis=0, ascending=False)
     数学   物理   语文
张三  1.0  1.0  1.0
李四  2.0  2.0  2.0
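Series.order() and sort_index(by=...) shown above are the pre-0.17 spellings; both were folded into sort_values in later pandas. A small sketch under that assumption:

import pandas as pd

obj = pd.Series([4, 5, -3, 2])
print(obj.sort_values())                  # replaces obj.order()

frame = pd.DataFrame({'v': [3, 7], 'l': [2, 6]}, index=['three', 'one'])
print(frame.sort_values(by='v', ascending=False))   # replaces sort_index(by='v')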
>>> datastu = pd.read_csv('/Users/similarface/Downloads/jnn.csv')
>>> datastu
           准考证号   姓名  班级     语文  数学     英语  化学  物理
0  304040250124   罗茜   1  101.0  94  102.5  79  74
1  304040250128  沈怡君   1   91.5  96   69.0  82  69
2  304040250321   魏华   2   74.0  28   42.0  56  56
3  304040250233  何仕林   2   60.5  42   34.5  49  46
4  304040250725   屈妮   5   93.5  63   77.5  55  66
5  304040250709  邓培蓓   5  102.5  81   47.0  65  58
6  304040250805  郑清霞   5   89.0  80   63.5  63  65
7  304040250827   明杨   6  108.5  92   79.0  89  83
8  304040250819   李倩   6   93.5  61   44.0  45  32
9  304040250912  江明悦   6    0.0   0    0.0   0   0
>>> datastu.rank(axis=1, ascending=False, method='min')
   准考证号   姓名   班级   语文   数学   英语   化学   物理
0   2.0  1.0  8.0  4.0  5.0  3.0  6.0  7.0
1   2.0  1.0  8.0  4.0  3.0  6.0  5.0  6.0
2   2.0  1.0  8.0  3.0  7.0  6.0  4.0  4.0
3   2.0  1.0  8.0  3.0  6.0  7.0  4.0  5.0
4   2.0  1.0  8.0  3.0  6.0  4.0  7.0  5.0
5   2.0  1.0  8.0  3.0  4.0  7.0  5.0  6.0
6   2.0  1.0  8.0  3.0  4.0  6.0  7.0  5.0
7   2.0  1.0  8.0  3.0  4.0  7.0  5.0  6.0
8   2.0  1.0  8.0  3.0  4.0  6.0  5.0  7.0
9   2.0  1.0  3.0  4.0  4.0  4.0  4.0  4.0
>>> datastu.rank(axis=0, ascending=False, method='min')
   准考证号    姓名   班级    语文    数学    英语    化学    物理
0  10.0   4.0  9.0   3.0   2.0   1.0   3.0   2.0
1   9.0   5.0  9.0   6.0   1.0   4.0   2.0   3.0
2   7.0   1.0  7.0   8.0   9.0   8.0   6.0   7.0
3   8.0  10.0  7.0   9.0   8.0   9.0   8.0   8.0
4   5.0   9.0  4.0   4.0   6.0   3.0   7.0   4.0
5   6.0   3.0  4.0   2.0   4.0   6.0   4.0   6.0
6   4.0   2.0  4.0   7.0   5.0   5.0   5.0   5.0
7   2.0   8.0  1.0   1.0   3.0   2.0   1.0   1.0
8   3.0   7.0  1.0   4.0   7.0   7.0   9.0   9.0
9   1.0   6.0  1.0  10.0  10.0  10.0  10.0  10.0

>>> data = datastu[['语文', '数学', '物理', '英语', '化学']]
>>> data
      语文  数学  物理     英语  化学
0  101.0  94  74  102.5  79
1   91.5  96  69   69.0  82
2   74.0  28  56   42.0  56
3   60.5  42  46   34.5  49
4   93.5  63  66   77.5  55
5  102.5  81  58   47.0  65
6   89.0  80  65   63.5  63
7  108.5  92  83   79.0  89
8   93.5  61  32   44.0  45
9    0.0   0   0    0.0   0

# Column sums
>>> data.sum()
语文    814.0
数学    637.0
物理    549.0
英语    559.0
化学    583.0
dtype: float64
# Row sums
>>> data.sum(axis=1)
0    450.5
1    407.5
2    256.0
3    232.0
4    355.0
5    353.5
6    360.5
7    451.5
8    275.5
9      0.0
dtype: float64
# Common reduction options: axis (direction), skipna (exclude NaN), level (for a MultiIndex)

# Indirect statistics: idxmax returns the index label of the maximum
>>> data.idxmax()
语文    7    # the highest 语文 score sits at index 7, and so on
数学    1
物理    7
英语    0
化学    7
dtype: int64

# Cumulative sums
>>> data.cumsum()
      语文     数学     物理     英语     化学
0  101.0   94.0   74.0  102.5   79.0
1  192.5  190.0  143.0  171.5  161.0
2  266.5  218.0  199.0  213.5  217.0
3  327.0  260.0  245.0  248.0  266.0
4  420.5  323.0  311.0  325.5  321.0
5  523.0  404.0  369.0  372.5  386.0
6  612.0  484.0  434.0  436.0  449.0
7  720.5  576.0  517.0  515.0  538.0
8  814.0  637.0  549.0  559.0  583.0
9  814.0  637.0  549.0  559.0  583.0

# Summary statistics in one shot
>>> data.describe()
               语文        数学         物理         英语         化学
count   10.000000  10.00000  10.000000  10.000000  10.000000
mean    81.400000  63.70000  54.900000  55.900000  58.300000
std     31.857146  31.86447  24.052951  28.670349  25.117723
min      0.000000   0.00000   0.000000   0.000000   0.000000
25%     77.750000  46.75000  48.500000  42.500000  50.500000
50%     92.500000  71.50000  61.500000  55.250000  59.500000
75%     99.125000  89.25000  68.250000  75.375000  75.500000
max    108.500000  96.00000  83.000000 102.500000  89.000000
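A short sketch of the skipna option mentioned above; mean is used because its NaN behaviour is the same across pandas versions:

import numpy as np
import pandas as pd

df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan]],
                  columns=['one', 'two'])
print(df.mean(axis=1))                # NaN cells are skipped by default
print(df.mean(axis=1, skipna=False))  # any NaN in the row makes the result NaN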
'''
Reduction and statistics methods on DataFrame:
DataFrame.abs()                                    Absolute value of every element (numeric objects only).
DataFrame.all([axis, bool_only, skipna, level])    Whether all elements are True over the requested axis.
DataFrame.any([axis, bool_only, skipna, level])    Whether any element is True over the requested axis.
DataFrame.clip([lower, upper, out, axis])          Trim values at the given threshold(s).
DataFrame.clip_lower(threshold[, axis])            Truncate values below the given threshold(s).
DataFrame.clip_upper(threshold[, axis])            Truncate values above the given threshold(s).
DataFrame.corr([method, min_periods])              Pairwise correlation of columns, excluding NA/null values.
DataFrame.corrwith(other[, axis, drop])            Pairwise correlation between rows or columns of two DataFrames.
DataFrame.count([axis, level, numeric_only])       Number of non-NA/null observations over the requested axis.
DataFrame.cov([min_periods])                       Pairwise covariance of columns, excluding NA/null values.
DataFrame.cummax([axis, dtype, out, skipna])       Cumulative max over the requested axis.
DataFrame.cummin([axis, dtype, out, skipna])       Cumulative min over the requested axis.
DataFrame.cumprod([axis, dtype, out, skipna])      Cumulative product over the requested axis.
DataFrame.cumsum([axis, dtype, out, skipna])       Cumulative sum over the requested axis.
DataFrame.describe([percentiles, include, ...])    Summary statistics, excluding NaN values.
DataFrame.diff([periods, axis])                    First discrete difference (very useful for time series).
DataFrame.eval(expr[, inplace])                    Evaluate an expression in the context of the DataFrame.
DataFrame.kurt([axis, skipna, level, ...])         Unbiased kurtosis, the 4th moment (Fisher's definition: normal == 0.0).
DataFrame.mad([axis, skipna, level])               Mean absolute deviation over the requested axis.
DataFrame.max([axis, skipna, level, ...])          Maximum of the values.
DataFrame.mean([axis, skipna, level, ...])         Mean of the values.
DataFrame.median([axis, skipna, level, ...])       Median of the values.
DataFrame.min([axis, skipna, level, ...])          Minimum of the values.
DataFrame.mode([axis, numeric_only])               Mode(s) of each element along the selected axis.
DataFrame.pct_change([periods, fill_method, ...])  Percent change over the given number of periods.
DataFrame.prod([axis, skipna, level, ...])         Product of the values.
DataFrame.quantile([q, axis, numeric_only, ...])   Values at the given quantile, a la numpy.percentile.
DataFrame.rank([axis, method, numeric_only, ...])  Numerical data ranks (1 through n) along the axis.
DataFrame.round([decimals, out])                   Round to a variable number of decimal places.
DataFrame.sem([axis, skipna, level, ddof, ...])    Unbiased standard error of the mean.
DataFrame.skew([axis, skipna, level, ...])         Unbiased skew, the 3rd moment.
DataFrame.sum([axis, skipna, level, ...])          Sum of the values.
DataFrame.std([axis, skipna, level, ddof, ...])    Sample standard deviation.
DataFrame.var([axis, skipna, level, ddof, ...])    Unbiased variance.
'''

# Correlation and covariance on real quotes (pandas.io.data worked in pandas 0.x)
>>> import pandas.io.data as web
>>> all_data = {}
>>> for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
...     all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
>>> price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteritems()})
>>> volume = DataFrame({tic: data['Volume'] for tic, data in all_data.iteritems()})
>>> returns = price.pct_change()
>>> returns.tail()
                AAPL      GOOG       IBM      MSFT
Date
2009-12-24  0.034339  0.011117  0.004385  0.002587
2009-12-28  0.012294  0.007098  0.013326  0.005484
2009-12-29 -0.011861 -0.005571 -0.003477  0.007058
2009-12-30  0.012147  0.005376  0.005461 -0.013699
2009-12-31 -0.004300 -0.004416 -0.012597 -0.015504

# Correlation coefficients
>>> returns.IBM.corr(returns.GOOG)
0.39068882087254675
>>> returns.corrwith(returns.IBM)
AAPL    0.410011
GOOG    0.390689
IBM     1.000000
MSFT    0.495980
dtype: float64
>>> returns.corrwith(volume)
AAPL   -0.057549
GOOG    0.062647
IBM    -0.007892
MSFT   -0.014245
dtype: float64

# Value counts
>>> obj = Series(['c', 'b', 'c', 'c', 'd', 'a', 'g', 'b'])
>>> obj.value_counts()
c    3
b    2
g    1
d    1
a    1
dtype: int64
>>> pd.value_counts(obj.values, sort=False)
a    1
c    3
b    2
d    1
g    1
dtype: int64

# Membership testing with isin
>>> mask = obj.isin(['b', 'c'])
>>> mask
0     True
1     True
2     True
3     True
4    False
5    False
6    False
7     True
dtype: bool
>>> obj[mask]
0    c
1    b
2    c
3    c
7    b
dtype: object
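pandas.io.data was removed from pandas itself and lives on as the separate pandas-datareader package; a sketch of the same pct_change/corr workflow on synthetic prices, so it runs without a network connection (the tickers are just labels here):

import numpy as np
import pandas as pd

rng = pd.date_range('2000-01-03', periods=250, freq='B')
price = pd.DataFrame(np.random.lognormal(mean=0, sigma=0.01, size=(250, 2)).cumprod(axis=0),
                     index=rng, columns=['AAPL', 'IBM'])   # stand-ins for the Yahoo download
returns = price.pct_change()
print(returns['AAPL'].corr(returns['IBM']))
print(returns.corrwith(returns['IBM']))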
# A frequency table across several columns
>>> data = DataFrame({'Qu1': [1, 3, 4, 5, 3],
...                   'Qu2': [2, 4, 1, 2, 4],
...                   'Qu3': [3, 4, 2, 1, 1]})
>>> data
   Qu1  Qu2  Qu3
0    1    2    3
1    3    4    4
2    4    1    2
3    5    2    1
4    3    4    1
>>> data.apply(pd.value_counts).fillna(0)
   Qu1  Qu2  Qu3
1  1.0  1.0  2.0
2  0.0  2.0  1.0
3  2.0  0.0  1.0
4  1.0  2.0  1.0
5  1.0  0.0  0.0

# Handling missing data
>>> string_data = Series(['张三', '李四', np.nan, '赵六'])
>>> string_data
0     张三
1     李四
2    NaN
3     赵六
dtype: object
>>> string_data.isnull()
0    False
1    False
2     True
3    False
dtype: bool

###### Filtering out missing data
>>> from numpy import nan as NA
>>> data = Series([1, NA, 3.5, NA, 7])
>>> data.dropna()
0    1.0
2    3.5
4    7.0
dtype: float64
>>> data
0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64
>>> data[data.notnull()]
0    1.0
2    3.5
4    7.0
dtype: float64

# By default a DataFrame drops every row that contains at least one NA
>>> data = DataFrame([[1., 6.5, 3.], [1, NA, NA], [NA, NA, NA], [NA, 6.5, 3.]])
>>> data
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
>>> data.dropna()
     0    1    2
0  1.0  6.5  3.0
# how='all' drops only rows that are entirely NA
>>> data.dropna(how='all')
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0
# Dropping columns that are entirely NA
>>> data[4] = NA           # this assignment was missing from the original transcript
>>> data
     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  NaN  NaN  NaN NaN
3  NaN  6.5  3.0 NaN
>>> data.dropna(axis=1, how='all')
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0

# thresh keeps only the rows with at least that many non-NA values
# (df below was built earlier in the original session; reconstructed here)
>>> df = DataFrame(np.random.randn(7, 3))
>>> df.ix[:4, 1] = NA; df.ix[:2, 2] = NA
>>> df.dropna(thresh=3)
          0         1         2
5  0.519277  1.182077 -0.500918
6 -0.050867 -0.051302  1.368309

# Filling in missing data
>>> df.fillna(-1)
          0         1         2
0  0.581403 -1.000000 -1.000000
1 -1.709160 -1.000000 -1.000000
2  2.496074 -1.000000 -1.000000
3  0.329339 -1.000000  0.736299
4 -0.638106 -1.000000  0.756044
5  0.519277  1.182077 -0.500918
6 -0.050867 -0.051302  1.368309
# Per-column fill values via a dict
>>> df.fillna({1: 0.5, 3: -1})
          0         1         2
0  0.581403  0.500000       NaN
1 -1.709160  0.500000       NaN
2  2.496074  0.500000       NaN
3  0.329339  0.500000  0.736299
4 -0.638106  0.500000  0.756044
5  0.519277  1.182077 -0.500918
6 -0.050867 -0.051302  1.368309
# inplace=True modifies df itself; by default a new object is returned
>>> df.fillna({1: 0.5, 3: -1}, inplace=True)
>>> df
          0         1         2
0  0.581403  0.500000       NaN
1 -1.709160  0.500000       NaN
2  2.496074  0.500000       NaN
3  0.329339  0.500000  0.736299
4 -0.638106  0.500000  0.756044
5  0.519277  1.182077 -0.500918
6 -0.050867 -0.051302  1.368309

>>> info = DataFrame(np.random.randn(6, 3))
>>> info.ix[:2, 1] = NA; info.ix[4:, 2] = NA
>>> info
          0         1         2
0  1.217480       NaN  0.479981
1 -2.104463       NaN -2.917539
2 -2.141440       NaN -1.371574
3  0.925971  1.697813  0.814347
4 -1.463290 -0.526497       NaN
5 -0.300475  0.839098       NaN
# limit caps how many consecutive values get filled
>>> info.fillna(method='bfill', limit=1)
          0         1         2
0  1.217480       NaN  0.479981
1 -2.104463       NaN -2.917539
2 -2.141440  1.697813 -1.371574
3  0.925971  1.697813  0.814347
4 -1.463290 -0.526497       NaN
5 -0.300475  0.839098       NaN
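In current pandas, fillna(method='bfill') has the shorthand .bfill() (and .ffill() for the forward direction); a minimal sketch:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[:3, 1] = np.nan
print(df.bfill(limit=1))   # fill at most one consecutive NaN from below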
# Hierarchical indexing
>>> data = Series(np.random.randn(10),
...               index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
...                      [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
>>> data
a  1    1.148945
   2   -0.489120
   3    1.151546
b  1    0.840938
   2   -1.992375
   3    0.039002
c  1    2.157531
   2    0.963063
d  2    0.130796
   3    0.012320
dtype: float64
>>> data.index
MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
>>> data['b']
1    0.840938
2   -1.992375
3    0.039002
dtype: float64
>>> data['b':'c']
b  1    0.840938
   2   -1.992375
   3    0.039002
c  1    2.157531
   2    0.963063
dtype: float64
>>> data.ix[['b', 'd']]
b  1    0.840938
   2   -1.992375
   3    0.039002
d  2    0.130796
   3    0.012320
dtype: float64
>>> data[:, 2]
a   -0.489120
b   -1.992375
c    0.963063
d    0.130796
dtype: float64

# unstack pivots the inner level into columns, turning the Series into a DataFrame
>>> data.unstack()
          1         2         3
a  1.148945 -0.489120  1.151546
b  0.840938 -1.992375  0.039002
c  2.157531  0.963063       NaN
d       NaN  0.130796  0.012320
>>> data.unstack().stack()
a  1    1.148945
   2   -0.489120
   3    1.151546
b  1    0.840938
   2   -1.992375
   3    0.039002
c  1    2.157531
   2    0.963063
d  2    0.130796
   3    0.012320
dtype: float64

# A DataFrame can carry a MultiIndex on both axes
>>> frame = DataFrame(np.arange(12).reshape((4, 3)),
...                   index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
...                   columns=[['good', 'good', 'bad'], ['G', 'R', 'G']])
>>> frame
     good     bad
        G   R   G
a 1     0   1   2
  2     3   4   5
b 1     6   7   8
  2     9  10  11
# Both axes of a MultiIndex can be named
>>> frame.index.names = ['key1', 'key2']
>>> frame.columns.names = ['state', 'color']
>>> frame
state      good     bad
color         G   R   G
key1 key2
a    1        0   1   2
     2        3   4   5
b    1        6   7   8
     2        9  10  11
>>> frame['good']
color      G   R
key1 key2
a    1     0   1
     2     3   4
b    1     6   7
     2     9  10

# Reordering and sorting levels
>>> frame.swaplevel('key1', 'key2')
state      good     bad
color         G   R   G
key2 key1
1    a        0   1   2
2    a        3   4   5
1    b        6   7   8
2    b        9  10  11
>>> frame.sortlevel(1)
state      good     bad
color         G   R   G
key1 key2
a    1        0   1   2
b    1        6   7   8
a    2        3   4   5
b    2        9  10  11
>>> frame.swaplevel(0, 1).sortlevel(0)
state      good     bad
color         G   R   G
key2 key1
1    a        0   1   2
     b        6   7   8
2    a        3   4   5
     b        9  10  11

# Summary statistics by level
>>> frame.sum(level='key2')
state good     bad
color    G   R   G
key2
1        6   8  10
2       12  14  16
>>> frame.sum(level='color', axis=1)
color       G   R
key1 key2
a    1      2   1
     2      8   4
b    1     14   7
     2     20  10

# Using DataFrame columns as the index
>>> frame = DataFrame({'a': range(7), 'b': range(7, 0, -1),
...                    'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
...                    'd': [0, 1, 2, 0, 1, 2, 3]})
>>> frame
   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3
>>> frame2 = frame.set_index(['c', 'd'])
>>> frame2
       a  b
c   d
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1
# drop=False keeps the indexed columns in the frame as well
>>> frame2 = frame.set_index(['c', 'd'], drop=False)
>>> frame2
       a  b    c  d
c   d
one 0  0  7  one  0
    1  1  6  one  1
    2  2  5  one  2
two 0  3  4  two  0
    1  4  3  two  1
    2  5  2  two  2
    3  6  1  two  3
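reset_index is the inverse of the set_index call above; a quick sketch:

import pandas as pd

frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
                      'c': ['one'] * 3 + ['two'] * 4,
                      'd': [0, 1, 2, 0, 1, 2, 3]})
frame2 = frame.set_index(['c', 'd'])   # two columns become a MultiIndex
print(frame2.reset_index().head())     # moves the index levels back into columns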
############## Reading files ################
>>> os.system('cat /Users/similarface/Downloads/jnn.csv')
准考证号,姓名,班级,语文,数学,英语,化学,物理
304040250124,罗茜,1,101,94,102.5,79,74
304040250128,沈怡君,1,91.5,96,69,82,69
304040250321,魏华,2,74,28,42,56,56
304040250233,何仕林,2,60.5,42,34.5,49,46
304040250725,屈妮,5,93.5,63,77.5,55,66
304040250709,邓培蓓,5,102.5,81,47,65,58
304040250805,郑清霞,5,89,80,63.5,63,65
304040250827,明杨,6,108.5,92,79,89,83
304040250819,李倩,6,93.5,61,44,45,32
304040250912,江明悦,6,0,0,0,0,0
0
>>> pd.read_csv('/Users/similarface/Downloads/jnn.csv')
           准考证号   姓名  班级     语文  数学     英语  化学  物理
0  304040250124   罗茜   1  101.0  94  102.5  79  74
1  304040250128  沈怡君   1   91.5  96   69.0  82  69
2  304040250321   魏华   2   74.0  28   42.0  56  56
3  304040250233  何仕林   2   60.5  42   34.5  49  46
4  304040250725   屈妮   5   93.5  63   77.5  55  66
5  304040250709  邓培蓓   5  102.5  81   47.0  65  58
6  304040250805  郑清霞   5   89.0  80   63.5  63  65
7  304040250827   明杨   6  108.5  92   79.0  89  83
8  304040250819   李倩   6   93.5  61   44.0  45  32
9  304040250912  江明悦   6    0.0   0    0.0   0   0
>>> pd.read_csv('/Users/similarface/Downloads/jnn.csv', index_col='准考证号')
               姓名  班级     语文  数学     英语  化学  物理
准考证号
304040250124   罗茜   1  101.0  94  102.5  79  74
304040250128  沈怡君   1   91.5  96   69.0  82  69
304040250321   魏华   2   74.0  28   42.0  56  56
304040250233  何仕林   2   60.5  42   34.5  49  46
304040250725   屈妮   5   93.5  63   77.5  55  66
304040250709  邓培蓓   5  102.5  81   47.0  65  58
304040250805  郑清霞   5   89.0  80   63.5  63  65
304040250827   明杨   6  108.5  92   79.0  89  83
304040250819   李倩   6   93.5  61   44.0  45  32
304040250912  江明悦   6    0.0   0    0.0   0   0

# sep='\s+' splits on runs of whitespace of any length
>>> result = pd.read_table('ext3.txt', sep='\s+')

# skiprows: row numbers to skip
>>> pd.read_csv('/Users/similarface/Downloads/jnn.csv', index_col='准考证号', skiprows=[5, 9])
               姓名  班级     语文  数学     英语  化学  物理
准考证号
304040250124   罗茜   1  101.0  94  102.5  79  74
304040250128  沈怡君   1   91.5  96   69.0  82  69
304040250321   魏华   2   74.0  28   42.0  56  56
304040250233  何仕林   2   60.5  42   34.5  49  46
304040250709  邓培蓓   5  102.5  81   47.0  65  58
304040250805  郑清霞   5   89.0  80   63.5  63  65
304040250827   明杨   6  108.5  92   79.0  89  83
304040250912  江明悦   6    0.0   0    0.0   0   0

# Sentinel values such as NA, -1.#IND and NULL are read as missing
>>> os.system('cat /Users/similarface/Downloads/ex5.csv')
something,a,b,c,d,message
one,1,2,IND,4,NA
tow,-1,-1,,8,world
three,.,10,11,NULL,foo
>>> pd.read_csv('/Users/similarface/Downloads/ex5.csv', na_values=['NULL'])
  something   a   b    c    d message
0       one   1   2  IND  4.0     NaN
1       tow  -1  -1  NaN  8.0   world
2     three   .  10   11  NaN     foo
# Declaring extra NA sentinels
>>> pd.read_csv('/Users/similarface/Downloads/ex5.csv', na_values=['-1'])
  something    a     b    c    d message
0       one    1   2.0  IND  4.0     NaN
1       tow  NaN   NaN  NaN  8.0   world
2     three    .  10.0   11  NaN     foo
# Per-column sentinels via a dict
>>> sentinels = {'message': ['foo', 'NA'], 'something': ['tow']}
>>> pd.read_csv('/Users/similarface/Downloads/ex5.csv', na_values=sentinels)
  something   a   b    c    d message
0       one   1   2  IND  4.0     NaN
1       NaN  -1  -1  NaN  8.0   world
2     three   .  10   11  NaN     NaN

'''
read_csv / read_table arguments (from the pandas documentation):

filepath_or_buffer : str, pathlib.Path, or any object with a read() method (file handle, StringIO).
    The string may be a URL; valid schemes include http, ftp, s3, and file
    (for file URLs a host is expected, e.g. file://localhost/path/to/table.csv).
sep : str, default ','
    Delimiter to use. If None, the delimiter is sniffed automatically. Regular expressions are
    accepted and force the python parsing engine, which ignores quotes in the data.
delimiter : str, default None
    Alternative argument name for sep.
header : int or list of ints, default 'infer'
    Row number(s) to use as the column names and the start of the data. Defaults to 0 if no
    names are passed, otherwise None. Pass header=0 explicitly to replace existing names.
    A list of ints builds a MultiIndex on the columns (rows in between are skipped). Commented
    and blank lines are ignored when skip_blank_lines=True, so header=0 means the first line
    of data rather than the first line of the file.
names : array-like, default None
    Column names to use. If the file has no header row, also pass header=None.
index_col : int or sequence or False, default None
    Column to use as the row labels; a sequence gives a MultiIndex. For malformed files with
    delimiters at the end of each line, index_col=False forces pandas not to use the first
    column as the index.
usecols : array-like, default None
    Return a subset of the columns; much faster parsing and lower memory usage.
squeeze : boolean, default False
    If the parsed data contains only one column, return a Series.
prefix : str, default None
    Prefix for column numbers when there is no header, e.g. 'X' for X0, X1, ...
mangle_dupe_cols : boolean, default True
    Duplicate columns are renamed 'X.0'...'X.N' rather than left as 'X'...'X'.
dtype : type name or dict of column -> type, default None
    Data type for the data or per column, e.g. {'a': np.float64, 'b': np.int32}
    (unsupported with engine='python'). Use str or object to avoid interpreting dtypes.
engine : {'c', 'python'}, optional
    The C engine is faster; the python engine is currently more feature-complete.
converters : dict, default None
    Functions for converting values in certain columns; keys are ints or column labels.
true_values : list, default None
    Values to consider as True.
false_values : list, default None
    Values to consider as False.
skipinitialspace : boolean, default False
    Skip spaces after the delimiter.
skiprows : list-like or integer, default None
    Line numbers to skip (0-indexed), or the number of lines to skip at the start of the file.
skipfooter : int, default 0
    Lines at the bottom of the file to skip (unsupported with engine='c').
nrows : int, default None
    Number of rows to read; useful for reading pieces of large files.
na_values : str, list-like or dict, default None
    Additional strings to recognize as NA/NaN; a dict gives per-column sentinels. By default
    '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN',
    'N/A', 'NA', 'NULL', 'NaN', 'nan' are interpreted as NaN.
keep_default_na : bool, default True
    If na_values is given and keep_default_na is False, the default NaN values are overridden
    rather than appended to.
na_filter : boolean, default True
    Detect missing-value markers. On data with no NAs, na_filter=False speeds up parsing
    of large files.
verbose : boolean, default False
    Report the number of NA values placed in non-numeric columns.
skip_blank_lines : boolean, default True
    Skip blank lines rather than interpreting them as NaN values.
parse_dates : boolean, list of ints/names, list of lists, or dict, default False
    True: try parsing the index. [1, 2, 3]: parse columns 1, 2, 3 each as a separate date
    column. [[1, 3]]: combine columns 1 and 3 and parse as a single date column.
    {'foo': [1, 3]}: parse columns 1 and 3 as a date and call the result 'foo'.
    A fast path exists for iso8601-formatted dates.
infer_datetime_format : boolean, default False
    With parse_dates enabled, try to infer the datetime format to speed up parsing.
keep_date_col : boolean, default False
    When parse_dates combines multiple columns, keep the original columns too.
date_parser : function, default None
    Function for converting string columns to datetimes (default: dateutil.parser.parser).
    pandas tries it three ways, advancing on exceptions: with one or more arrays from
    parse_dates; with a row-wise concatenated string array; and once per row.
dayfirst : boolean, default False
    DD/MM format dates, international and European format.
iterator : boolean, default False
    Return a TextFileReader object for iteration or for get_chunk().
chunksize : int, default None
    Return a TextFileReader object for iteration; see the IO Tools docs.
compression : {'infer', 'gzip', 'bz2', None}, default 'infer'
    On-the-fly decompression; 'infer' picks gzip or bz2 from a '.gz'/'.bz2' suffix.
thousands : str, default None
    Thousands separator.
decimal : str, default '.'
    Character to recognize as the decimal point (e.g. ',' for European data).
lineterminator : str (length 1), default None
    Character to break the file into lines (C parser only).
quotechar : str (length 1), optional
    Character denoting the start and end of a quoted item; delimiters inside quotes are ignored.
quoting : int or csv.QUOTE_* instance, default None
    Field quoting behavior: QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or
    QUOTE_NONE (3). None means QUOTE_MINIMAL.
escapechar : str (length 1), default None
    One-character string used to escape the delimiter when quoting is QUOTE_NONE.
comment : str, default None
    Single character marking the rest of the line as not to be parsed; at the start of a line
    the whole line is ignored. Fully commented lines are ignored by header but not by skiprows.
encoding : str, default None
    Encoding for UTF when reading/writing, e.g. 'utf-8'.
dialect : str or csv.Dialect instance, default None
    Defaults to the Excel dialect; ignored if sep is longer than one character.
tupleize_cols : boolean, default False
    Leave a list of tuples on the columns as-is rather than converting to a MultiIndex.
error_bad_lines : boolean, default True
    Lines with too many fields raise an exception by default; if False, the bad lines are
    dropped from the returned DataFrame (C parser only).
warn_bad_lines : boolean, default True
    With error_bad_lines=False, emit a warning for each bad line (C parser only).
'''

# Writing data out: pass a path or sys.stdout, and optionally a delimiter,
# whether to write the index and header, and a subset of columns
# data.to_csv(path_or_buf, sep='|', index=False, header=True, columns=[...])

# Database operations
import sqlite3
import pandas as pd
from pandas import DataFrame

query = """
CREATE TABLE test (
    a VARCHAR(20),
    b VARCHAR(20),
    c REAL,
    d INTEGER
);"""
con = sqlite3.connect(':memory:')
con.execute(query)
con.commit()
data = [('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)]
stmt = "INSERT INTO test VALUES (?,?,?,?)"
con.executemany(stmt, data)
con.commit()
cursor = con.execute('select * from test')
rows = cursor.fetchall()
DataFrame(rows, columns=zip(*cursor.description)[0])   # Python 2 idiom

# Or read a query straight into a DataFrame
import pandas.io.sql as sql
sql.read_sql('select * from test', con)
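A self-contained sketch exercising a few of the read_csv options documented above, round-tripping through a string buffer (the CSV content is made up for illustration):

import pandas as pd
from io import StringIO

csv_text = u"id,name,score\n1,ann,90\n2,bob,NA\n3,cai,75\n"
df = pd.read_csv(StringIO(csv_text), index_col='id',
                 na_values=['NA'],   # extra NA sentinel
                 nrows=2)            # read only the first two data rows
print(df)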
# Merging datasets
>>> df1 = DataFrame({'key': ['北京大学', '四川大学', '天津大学', '山东大学', '清华大学'],
...                  'major0': ['计算机', '生物', '化学', '物理', '医学']})
>>> df2 = DataFrame({'key': ['北京大学', '四川大学', '云南大学'],
...                  'major1': ['外国语', '口腔', '旅游']})
>>> df1
    key major0
0  北京大学    计算机
1  四川大学     生物
2  天津大学     化学
3  山东大学     物理
4  清华大学     医学
>>> df2
    key major1
0  北京大学    外国语
1  四川大学     口腔
2  云南大学     旅游
# merge joins on the overlapping column 'key' by default (an inner join)
>>> pd.merge(df1, df2)
    key major0 major1
0  北京大学    计算机    外国语
1  四川大学     生物     口腔

# Different key names on the two sides
>>> df3 = DataFrame({'lkey': ['北京大学', '四川大学', '天津大学', '山东大学', '清华大学'],
...                  'major0': ['计算机', '生物', '化学', '物理', '医学']})
>>> df4 = DataFrame({'rkey': ['北京大学', '四川大学', '云南大学'],
...                  'major1': ['外国语', '口腔', '旅游']})
>>> df3
   lkey major0
0  北京大学    计算机
1  四川大学     生物
2  天津大学     化学
3  山东大学     物理
4  清华大学     医学
>>> df4
  major1  rkey
0    外国语  北京大学
1     口腔  四川大学
2     旅游  云南大学
>>> pd.merge(df3, df4, left_on='lkey', right_on='rkey')
   lkey major0 major1  rkey
0  北京大学    计算机    外国语  北京大学
1  四川大学     生物     口腔  四川大学
# Outer join
>>> pd.merge(df3, df4, left_on='lkey', right_on='rkey', how='outer')
   lkey major0 major1   rkey
0  北京大学    计算机    外国语   北京大学
1  四川大学     生物     口腔   四川大学
2  天津大学     化学    NaN    NaN
3  山东大学     物理    NaN    NaN
4  清华大学     医学    NaN    NaN
5   NaN    NaN     旅游   云南大学
# Left join
>>> pd.merge(df3, df4, left_on='lkey', right_on='rkey', how='left')
   lkey major0 major1   rkey
0  北京大学    计算机    外国语   北京大学
1  四川大学     生物     口腔   四川大学
2  天津大学     化学    NaN    NaN
3  山东大学     物理    NaN    NaN
4  清华大学     医学    NaN    NaN
# Right join
>>> pd.merge(df3, df4, left_on='lkey', right_on='rkey', how='right')
   lkey major0 major1   rkey
0  北京大学    计算机    外国语   北京大学
1  四川大学     生物     口腔   四川大学
2   NaN    NaN     旅游   云南大学
# Inner join
>>> pd.merge(df3, df4, left_on='lkey', right_on='rkey', how='inner')
   lkey major0 major1  rkey
0  北京大学    计算机    外国语  北京大学
1  四川大学     生物     口腔  四川大学

# Merging on multiple keys
>>> left = DataFrame({'key1': ['foo', 'foo', 'bar'],
...                   'key2': ['one', 'two', 'one'],
...                   'lval': [1, 2, 3]})
>>> right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
...                    'key2': ['one', 'one', 'one', 'two'],
...                    'lval': [4, 5, 6, 7]})
>>> pd.merge(left, right, on=['key1', 'key2'], how='outer')
  key1 key2  lval_x  lval_y
0  foo  one     1.0     4.0
1  foo  one     1.0     5.0
2  foo  two     2.0     NaN
3  bar  one     3.0     6.0
4  bar  two     NaN     7.0

# Overlapping column names get suffixes
>>> pd.merge(left, right, on='key1', suffixes=('_lef', '_right'))
  key1 key2_lef  lval_lef key2_right  lval_right
0  foo      one         1        one           4
1  foo      one         1        one           5
2  foo      two         2        one           4
3  foo      two         2        one           5
4  bar      one         3        one           6
5  bar      one         3        two           7

# Merging on the index
>>> right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
>>> left1 = DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'], 'value': range(6)})
>>> pd.merge(left1, right1, left_on='key', right_index=True)
  key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0
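One more merge option worth knowing: indicator=True (available since pandas 0.17) adds a _merge column telling which side each row came from. A sketch reusing the frames above:

import pandas as pd

df3 = pd.DataFrame({'lkey': ['北京大学', '四川大学', '天津大学'],
                    'major0': ['计算机', '生物', '化学']})
df4 = pd.DataFrame({'rkey': ['北京大学', '云南大学'],
                    'major1': ['外国语', '旅游']})
merged = pd.merge(df3, df4, left_on='lkey', right_on='rkey',
                  how='outer', indicator=True)
print(merged[['lkey', 'rkey', '_merge']])   # left_only / right_only / both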
# Merging columns on the left against a MultiIndex on the right
>>> lefth = DataFrame({'key1': ['similar', 'similar', 'similar', 'face', 'face'],
...                    'key2': [2000, 2001, 2002, 2001, 2002],
...                    'data': np.arange(5.)})
>>> righth = DataFrame(np.arange(12).reshape((6, 2)),
...                    index=[['face', 'face', 'similar', 'similar', 'similar', 'similar'],
...                           [2001, 2000, 2000, 2000, 2001, 2002]],
...                    columns=['event1', 'event2'])
>>> lefth
   data     key1  key2
0   0.0  similar  2000
1   1.0  similar  2001
2   2.0  similar  2002
3   3.0     face  2001
4   4.0     face  2002
>>> righth
              event1  event2
face    2001       0       1
        2000       2       3
similar 2000       4       5
        2000       6       7
        2001       8       9
        2002      10      11
>>> pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)
   data     key1  key2  event1  event2
0   0.0  similar  2000       4       5
0   0.0  similar  2000       6       7
1   1.0  similar  2001       8       9
2   2.0  similar  2002      10      11
3   3.0     face  2001       0       1

>>> left2 = DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=['a', 'c', 'e'],
...                   columns=['similar', 'face'])
>>> left2
   similar  face
a      1.0   2.0
c      3.0   4.0
e      5.0   6.0
>>> right2 = DataFrame([[7., 8.], [9., 10.], [11., 12.], [13., 14.]],
...                    index=['b', 'c', 'd', 'e'], columns=['M', 'A'])
>>> right2
      M     A
b   7.0   8.0
c   9.0  10.0
d  11.0  12.0
e  13.0  14.0
>>> pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
   similar  face     M     A
a      1.0   2.0   NaN   NaN
b      NaN   NaN   7.0   8.0
c      3.0   4.0   9.0  10.0
d      NaN   NaN  11.0  12.0
e      5.0   6.0  13.0  14.0
# join is a shortcut for index-on-index merges
>>> left2.join(right2, how='outer')
   similar  face     M     A
a      1.0   2.0   NaN   NaN
b      NaN   NaN   7.0   8.0
c      3.0   4.0   9.0  10.0
d      NaN   NaN  11.0  12.0
e      5.0   6.0  13.0  14.0
>>> another = DataFrame([[7, 8], [9, 10], [11, 12], [16, 17]],
...                     index=['a', 'c', 'e', 'f'], columns=['NK', 'O'])
>>> left2.join([right2, another])
   similar  face     M     A  NK   O
a      1.0   2.0   NaN   NaN   7   8
c      3.0   4.0   9.0  10.0   9  10
e      5.0   6.0  13.0  14.0  11  12

# Concatenating along an axis
>>> arr = np.arange(12).reshape((3, 4))
>>> arr
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
>>> np.concatenate([arr, arr], axis=1)
array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])
>>> s1 = Series([0, 1], index=['a', 'b'])
>>> s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
>>> s3 = Series([5, 6], index=['f', 'g'])
>>> pd.concat([s1, s2, s3])
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
>>> pd.concat([s1, s2, s3, s1])
a    0
b    1
c    2
d    3
e    4
f    5
g    6
a    0
b    1
dtype: int64
>>> pd.concat([s1, s2, s3, s1], axis=1)
     0    1    2    3
a  0.0  NaN  NaN  0.0
b  1.0  NaN  NaN  1.0
c  NaN  2.0  NaN  NaN
d  NaN  3.0  NaN  NaN
e  NaN  4.0  NaN  NaN
f  NaN  NaN  5.0  NaN
g  NaN  NaN  6.0  NaN

>>> df1 = DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'], columns=['one', 'two'])
>>> df2 = DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'], columns=['three', 'four'])
# keys label the pieces and become the outer level of a column MultiIndex
>>> pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])
  level1     level2
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0
>>> pd.concat({'level1': df1, 'level2': df2}, axis=1)
  level1     level2
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0
>>> pd.concat([df1, df2], axis=1, keys=['L1', 'L2'], names=['u', 'l'])
u   L1       L2
l  one two three four
a    0   1   5.0  6.0
b    2   3   NaN  NaN
c    4   5   7.0  8.0

>>> df1 = DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
>>> df2 = DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
>>> df1
          a         b         c         d
0 -1.487358  0.077565  0.209403 -0.712507
1  1.990047 -0.221415  1.381161 -0.876811
2 -0.153150  0.391847  1.180728 -0.972548
>>> df2
          b         d         a
0 -0.200611  0.321759 -0.201620
1 -1.842735 -1.924933  0.281712
>>> pd.concat([df1, df2])
          a         b         c         d
0 -1.487358  0.077565  0.209403 -0.712507
1  1.990047 -0.221415  1.381161 -0.876811
2 -0.153150  0.391847  1.180728 -0.972548
0 -0.201620 -0.200611       NaN  0.321759
1  0.281712 -1.842735       NaN -1.924933
# ignore_index=True discards the old row labels
>>> pd.concat([df1, df2], ignore_index=True)
          a         b         c         d
0 -1.487358  0.077565  0.209403 -0.712507
1  1.990047 -0.221415  1.381161 -0.876811
2 -0.153150  0.391847  1.180728 -0.972548
3 -0.201620 -0.200611       NaN  0.321759
4  0.281712 -1.842735       NaN -1.924933
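By default concat(axis=1) takes the union of the row labels; join='inner' keeps only the shared ones. A minimal sketch:

import numpy as np
import pandas as pd

df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=list('abc'), columns=['one', 'two'])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=list('ac'), columns=['three', 'four'])
print(pd.concat([df1, df2], axis=1, join='inner'))   # only rows 'a' and 'c' survive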
>>> pd.concat([df1, df2], ignore_index=True, axis=1)
          0         1         2         3         4         5         6
0 -1.487358  0.077565  0.209403 -0.712507 -0.200611  0.321759 -0.201620
1  1.990047 -0.221415  1.381161 -0.876811 -1.842735 -1.924933  0.281712
2 -0.153150  0.391847  1.180728 -0.972548       NaN       NaN       NaN

# combine_first: patch missing values in one object with values from another
# (a and b were defined earlier in the original session; reconstructed from the outputs)
>>> a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=['f', 'e', 'd', 'c', 'b', 'a'])
>>> b = Series(np.arange(len(a), dtype=np.float64), index=['f', 'e', 'd', 'c', 'b', 'a'])
>>> b[-1] = np.nan
>>> b[:-2]
f    0.0
e    1.0
d    2.0
c    3.0
dtype: float64
>>> a[2:]
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64
>>> b[:-2].combine_first(a[2:])
a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64
>>> df1 = DataFrame({'a': [1, np.nan, 5, np.nan], 'b': [np.nan, 2, np.nan, 6],
...                  'c': range(2, 18, 4)})
>>> df2 = DataFrame({'a': [5, 4, np.nan, 3, 7], 'b': [np.nan, 3, 4, 6, 8]})
>>> df2
     a    b
0  5.0  NaN
1  4.0  3.0
2  NaN  4.0
3  3.0  6.0
4  7.0  8.0
>>> df1
     a    b   c
0  1.0  NaN   2
1  NaN  2.0   6
2  5.0  NaN  10
3  NaN  6.0  14
>>> df1.combine_first(df2)
     a    b     c
0  1.0  NaN   2.0
1  4.0  2.0   6.0
2  5.0  4.0  10.0
3  3.0  6.0  14.0
4  7.0  8.0   NaN

# Reshaping and pivoting
>>> data = DataFrame(np.arange(6).reshape((2, 3)),
...                  index=pd.Index(['similar', 'face'], name='state'),
...                  columns=pd.Index(['one', 'two', 'three'], name='number'))
>>> data
number   one  two  three
state
similar    0    1      2
face       3    4      5
>>> data.stack()
state    number
similar  one      0
         two      1
         three    2
face     one      3
         two      4
         three    5
dtype: int64
>>> data.stack().unstack()
number   one  two  three
state
similar    0    1      2
face       3    4      5
>>> data.stack().unstack(0)
state   similar  face
number
one           0     3
two           1     4
three         2     5
>>> data.stack().unstack('state')
state   similar  face
number
one           0     3
two           1     4
three         2     5

>>> s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
>>> s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
>>> s1
a    0
b    1
c    2
d    3
dtype: int64
>>> s2
c    4
d    5
e    6
dtype: int64
>>> pd.concat([s1, s2], keys=['one', 'two'])
one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64
>>> pd.concat([s1, s2], keys=['one', 'two']).unstack()
       a    b    c    d    e
one  0.0  1.0  2.0  3.0  NaN
two  NaN  NaN  4.0  5.0  6.0
>>> pd.concat([s1, s2], keys=['one', 'two']).unstack().stack()
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64
>>> pd.concat([s1, s2], keys=['one', 'two']).unstack().stack(dropna=False)
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64

# Transforming data with a function or mapping
>>> data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
...                            'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'],
...                   'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
>>> meat_to_animal = {'bacon': 'pig', 'pulled pork': 'pig', 'pastrami': 'cow',
...                   'corned beef': 'cow', 'honey ham': 'pig', 'nova lox': 'salmon'}
>>> data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
>>> data
          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     pig
2        bacon    12.0     pig
3     Pastrami     6.0     cow
4  corned beef     7.5     cow
5        Bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon
>>> data['food'].map(lambda x: meat_to_animal[x.lower()])
0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

# Discretization and binning
# (ages, bins and cats were defined earlier in the original session; reconstructed here)
>>> ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
>>> bins = [18, 25, 35, 60, 100]
>>> cats = pd.cut(ages, bins)
# Naming the bins
>>> group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
>>> pd.cut(ages, bins, labels=group_names)
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

# Equal-width bins: split the random data into 4 bins; precision = decimal places
>>> data = np.random.rand(20)
>>> data
array([ 0.42519089,  0.18981873,  0.29726754,  0.37843724,  0.31072184,
        0.20240683,  0.99244468,  0.61880299,  0.9948212 ,  0.32893834,
        0.87701908,  0.25638677,  0.02344737,  0.15162624,  0.31874342,
        0.16534997,  0.43495775,  0.83059911,  0.57975644,  0.53763544])
>>> pd.cut(data, 4, precision=2)
[(0.27, 0.51], (0.022, 0.27], (0.27, 0.51], (0.27, 0.51], (0.27, 0.51], ..., (0.022, 0.27], (0.27, 0.51], (0.75, 0.99], (0.51, 0.75], (0.51, 0.75]]
Length: 20
Categories (4, object): [(0.022, 0.27] < (0.27, 0.51] < (0.51, 0.75] < (0.75, 0.99]]

# Counts per bin
>>> pd.value_counts(cats)
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
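pd.cut above uses fixed bin edges; its sibling pd.qcut cuts at sample quantiles instead, so every bin holds roughly the same number of points. A sketch:

import numpy as np
import pandas as pd

data = np.random.randn(1000)
cats = pd.qcut(data, 4)          # quartile-based bins
print(pd.value_counts(cats))     # about 250 observations per bin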
# right=False makes the intervals closed on the left, open on the right
>>> pd.cut(ages, [18, 26, 36, 61, 100], right=False)

# Detecting and filtering outliers
>>> np.random.seed(12345)
>>> data = DataFrame(np.random.randn(1000, 4))
>>> data.describe()
                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean     -0.067684     0.067924     0.025598    -0.002298
std       0.998035     0.992106     1.006835     0.996794
min      -3.428254    -3.548824    -3.184377    -3.745356
25%      -0.774890    -0.591841    -0.641675    -0.644144
50%      -0.116401     0.101143     0.002073    -0.013611
75%       0.616366     0.780282     0.680391     0.654328
max       3.366626     2.653656     3.260383     3.927528
>>> col = data[3]
>>> col[np.abs(col) > 3]
97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64

# Permutation and random sampling
# (df below is a 5x4 arange frame; its definition was omitted in the original)
>>> df = DataFrame(np.arange(20).reshape(5, 4))
>>> sampler = np.random.permutation(5)
>>> df.take(sampler)
    0   1   2   3
4  16  17  18  19
2   8   9  10  11
1   4   5   6   7
3  12  13  14  15
0   0   1   2   3
>>> df.take(np.random.permutation(len(df))[:3])
   0  1   2   3
1  4  5   6   7
2  8  9  10  11
0  0  1   2   3

# Sampling with replacement from a given array
>>> bag = np.array([5, 7, -1, 6, 4])
>>> sampler = np.random.randint(0, len(bag), size=10)
>>> sampler
array([1, 0, 4, 1, 2, 1, 4, 4, 3, 4])
>>> draws = bag.take(sampler)
>>> draws
array([ 7,  5,  4,  7, -1,  7,  4,  4,  6,  4])

# Dummy/indicator matrix: one column per distinct value of a column
>>> df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
>>> df
   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   b
>>> pd.get_dummies(df['key'])
     a    b    c
0  0.0  1.0  0.0
1  0.0  1.0  0.0
2  1.0  0.0  0.0
3  0.0  0.0  1.0
4  1.0  0.0  0.0
5  0.0  1.0  0.0
# prefix prepends a string to the generated column names
>>> dummies = pd.get_dummies(df['key'], prefix='key')
>>> dummies
   key_a  key_b  key_c
0    0.0    1.0    0.0
1    0.0    1.0    0.0
2    1.0    0.0    0.0
3    0.0    0.0    1.0
4    1.0    0.0    0.0
5    0.0    1.0    0.0
>>> df_with_dummy = df[['data1']].join(dummies)
>>> df_with_dummy
   data1  key_a  key_b  key_c
0      0    0.0    1.0    0.0
1      1    0.0    1.0    0.0
2      2    1.0    0.0    0.0
3      3    0.0    0.0    1.0
4      4    1.0    0.0    0.0
5      5    0.0    1.0    0.0

# Combining get_dummies with cut
>>> values = np.random.rand(10)      # this definition was omitted in the original
>>> values
array([ 0.86789062,  0.4187927 ,  0.48191735,  0.44540277,  0.6855452 ,
        0.33193716,  0.20772778,  0.21461227,  0.50985294,  0.95327048])
>>> bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
>>> pd.get_dummies(pd.cut(values, bins))
   (0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1]
0       0.0         0.0         0.0         0.0       1.0
1       0.0         0.0         1.0         0.0       0.0
2       0.0         0.0         1.0         0.0       0.0
3       0.0         0.0         1.0         0.0       0.0
4       0.0         0.0         0.0         1.0       0.0
5       0.0         1.0         0.0         0.0       0.0
6       0.0         1.0         0.0         0.0       0.0
7       0.0         1.0         0.0         0.0       0.0
8       0.0         0.0         1.0         0.0       0.0
9       0.0         0.0         0.0         0.0       1.0

# E-mail regex
>>> import re
>>> pattern = r'([A-Z0-9.%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
>>> regex = re.compile(pattern, flags=re.IGNORECASE)
>>> regex.match('jaflfbs@sina.com')
<_sre.SRE_Match object at 0x111ceab78>
>>> m = regex.match('jaflfbs@sina.com')
>>> m.groups()
('jaflfbs', 'sina', 'com')
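re.match only anchors at the start of the string; for pulling every address out of a block of text, the same pattern works with findall. A sketch (the text is made up):

import re

pattern = r'([A-Z0-9.%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)
text = 'ann ann@sina.com bob bob@gmail.com'
print(regex.findall(text))   # [('ann', 'sina', 'com'), ('bob', 'gmail', 'com')]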
# Grouping with groupby
>>> df = DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
...                 'key2': ['one', 'two', 'one', 'tow', 'one'],
...                 'data1': np.random.randn(5),
...                 'data2': np.random.randn(5)})
>>> df
      data1     data2 key1 key2
0 -0.893905  0.311668    a  one
1  1.274761  0.885820    a  two
2  1.115914  0.887069    b  one
3  0.054165  0.267643    b  tow
4 -0.819516  0.933495    a  one
>>> grouped = df['data1'].groupby(df['key1'])
>>> grouped
<pandas.core.groupby.SeriesGroupBy object at 0x...>
>>> grouped.mean()
key1
a   -0.14622
b    0.58504
Name: data1, dtype: float64
>>> means = df['data1'].groupby([df['key1'], df['key2']]).mean()
>>> means
key1  key2
a     one    -0.856710
      two     1.274761
b     one     1.115914
      tow     0.054165
Name: data1, dtype: float64
>>> means.unstack()
key2       one       tow       two
key1
a    -0.856710       NaN  1.274761
b     1.115914  0.054165       NaN

# The grouping keys can be arbitrary arrays of the right length
>>> states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
>>> years = np.array([2005, 2005, 2006, 2005, 2006])
>>> df['data1'].groupby([states, years]).mean()

# Column names also work as keys; key2 is dropped below because it is not numeric
>>> df.groupby('key1').mean()
         data1     data2
key1
a     -0.14622  0.710328
b      0.58504  0.577356
>>> df.groupby(['key1', 'key2']).mean()
              data1     data2
key1 key2
a    one  -0.856710  0.622582
     two   1.274761  0.885820
b    one   1.115914  0.887069
     tow   0.054165  0.267643

# Size of each group
>>> df.groupby(['key1', 'key2']).size()
key1  key2
a     one     2
      two     1
b     one     1
      tow     1

# Materializing the groups as a dict
>>> pieces = dict(list(df.groupby('key1')))
>>> pieces['b']
      data1     data2 key1 key2
2  1.115914  0.887069    b  one
3  0.054165  0.267643    b  tow

############ Datetime operations
>>> from datetime import datetime
>>> now = datetime.now()
>>> now
datetime.datetime(2016, 4, 12, 14, 31, 50, 995484)
>>> now.year, now.month, now.day
(2016, 4, 12)
>>> now.day
12

# The difference of two datetimes is a datetime.timedelta
>>> delta = datetime(2016, 5, 1) - datetime(2016, 5, 2)
>>> delta
datetime.timedelta(-1)
>>> delta.days
-1
>>> delta.seconds
0
>>> from datetime import timedelta
>>> start = datetime(2011, 1, 1)
>>> start + timedelta(12)
datetime.datetime(2011, 1, 13, 0, 0)
>>> start - 2 * timedelta(12)
datetime.datetime(2010, 12, 8, 0, 0)

# Converting between strings and datetimes
>>> stamp = datetime(2011, 1, 3)
>>> str(stamp)
'2011-01-03 00:00:00'
>>> value = '2016-01-01'
>>> datetime.strptime(value, '%Y-%m-%d')
datetime.datetime(2016, 1, 1, 0, 0)
>>> value = '2016-01-13'
>>> datetime.strptime(value, '%Y-%m-%d')
datetime.datetime(2016, 1, 13, 0, 0)
>>> value = '2016-13-13'
>>> datetime.strptime(value, '%Y-%m-%d')
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/_strptime.py", line 325, in _strptime
    (data_string, format))
ValueError: time data '2016-13-13' does not match format '%Y-%m-%d'
>>> datestrs = ['7/6/2016', '1/1/1111']
>>> [datetime.strptime(x, '%m/%d/%Y') for x in datestrs]
[datetime.datetime(2016, 7, 6, 0, 0), datetime.datetime(1111, 1, 1, 0, 0)]
>>> from dateutil.parser import parse
>>> parse('2016-01-09')
datetime.datetime(2016, 1, 9, 0, 0)
>>> parse('Jan 31,2015 10:31 PM')
datetime.datetime(2015, 1, 31, 22, 31)
>>> parse('1/3/2018', dayfirst=True)
datetime.datetime(2018, 3, 1, 0, 0)
>>> parse('1/3/2018', dayfirst=False)
datetime.datetime(2018, 1, 3, 0, 0)
>>> datestrs = ['1/4/2016', '4/1/2017']
>>> pd.to_datetime(datestrs)
DatetimeIndex(['2016-01-04', '2017-04-01'], dtype='datetime64[ns]', freq=None)
>>> idx = pd.to_datetime(datestrs + [None])
>>> idx
DatetimeIndex(['2016-01-04', '2017-04-01', 'NaT'], dtype='datetime64[ns]', freq=None)
>>> pd.isnull(idx)
array([False, False,  True], dtype=bool)

# Time series indexed by datetimes
>>> dates = [datetime(2011, 1, 2), datetime(2016, 1, 1), datetime(2016, 1, 2),
...          datetime(2016, 1, 3), datetime(2016, 1, 4), datetime(2016, 1, 5)]
>>> from pandas import *
>>> ts = Series(np.random.randn(6), index=dates)
>>> ts
2011-01-02    0.734018
2016-01-01    1.661590
2016-01-02    0.839504
2016-01-03   -1.295834
2016-01-04    0.190545
2016-01-05    0.267724
dtype: float64
>>> ts + ts[::2]
2011-01-02    1.468037
2016-01-01         NaN
2016-01-02    1.679008
2016-01-03         NaN
2016-01-04    0.381091
2016-01-05         NaN
dtype: float64
>>> ts.index.dtype
dtype('<M8[ns]')
>>> stamp = ts.index[0]
>>> stamp
Timestamp('2011-01-02 00:00:00')
>>> stamp = ts.index[2]
>>> ts[stamp]
0.83950398236998658
>>> ts['1/1/2016']
1.6615901161098698
>>> longer_ts = Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
>>> longer_ts['2002-09-21':'2002-09-23']
2002-09-21   -0.105898
2002-09-22    1.708342
2002-09-23   -0.815799
Freq: D, dtype: float64
>>> longer_ts['2002-09-21':'09/23/2002']
2002-09-21   -0.105898
2002-09-22    1.708342
2002-09-23   -0.815799
Freq: D, dtype: float64
>>> longer_ts['2002-09-21':'23/09/2002']
2002-09-21   -0.105898
2002-09-22    1.708342
2002-09-23   -0.815799
Freq: D, dtype: float64
>>> longer_ts.truncate(before='2002-09-23')
2002-09-23   -0.815799
2002-09-24   -0.140892
2002-09-25   -0.397591
2002-09-26    0.451815
Freq: D, dtype: float64
>>> longer_ts.truncate(after='2002-09-23')

# Time series with duplicate timestamps
>>> dates = pd.DatetimeIndex(['1/1/2016', '1/2/2016', '1/2/2016', '1/2/2016', '1/3/2016'])
>>> dates
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-02', '2016-01-02', '2016-01-03'],
              dtype='datetime64[ns]', freq=None)
>>> dup_ts = Series(range(5), index=dates)
>>> dup_ts
2016-01-01    0
2016-01-02    1
2016-01-02    2
2016-01-02    3
2016-01-03    4
dtype: int64
>>> dup_ts.index.is_unique
False
>>> dup_ts[]
  File "<stdin>", line 1
    dup_ts[]
           ^
SyntaxError: invalid syntax
>>> dup_ts['1/2/2016']
2016-01-02    1
2016-01-02    2
2016-01-02    3
dtype: int64
# Collapse the duplicates by grouping on the timestamps (level 0)
>>> grouped = dup_ts.groupby(level=0)
>>> grouped.mean()
2016-01-01    0
2016-01-02    2
2016-01-03    4
dtype: int64
>>> grouped.max()
2016-01-01    0
2016-01-02    3
2016-01-03    4
dtype: int64
>>> grouped.count()
2016-01-01    1
2016-01-02    3
2016-01-03    1
dtype: int64
# Date ranges: April through June
>>> index = pd.date_range('4/1/2016', '6/1/2016')
# Start date plus a number of periods
>>> pd.date_range(start='4/1/2016', periods=20)
DatetimeIndex(['2016-04-01', '2016-04-02', '2016-04-03', '2016-04-04',
               '2016-04-05', '2016-04-06', '2016-04-07', '2016-04-08',
               '2016-04-09', '2016-04-10', '2016-04-11', '2016-04-12',
               '2016-04-13', '2016-04-14', '2016-04-15', '2016-04-16',
               '2016-04-17', '2016-04-18', '2016-04-19', '2016-04-20'],
              dtype='datetime64[ns]', freq='D')
>>> pd.date_range(end='2016-12-12', periods=10)
DatetimeIndex(['2016-12-03', '2016-12-04', '2016-12-05', '2016-12-06',
               '2016-12-07', '2016-12-08', '2016-12-09', '2016-12-10',
               '2016-12-11', '2016-12-12'],
              dtype='datetime64[ns]', freq='D')
# BM = business month end
>>> pd.date_range('1/1/2016', '12/2/2016', freq='BM')
DatetimeIndex(['2016-01-29', '2016-02-29', '2016-03-31', '2016-04-29',
               '2016-05-31', '2016-06-30', '2016-07-29', '2016-08-31',
               '2016-09-30', '2016-10-31', '2016-11-30'],
              dtype='datetime64[ns]', freq='BM')
>>> pd.date_range('5/2/2012 12:12:12', periods=5)
DatetimeIndex(['2012-05-02 12:12:12', '2012-05-03 12:12:12',
               '2012-05-04 12:12:12', '2012-05-05 12:12:12',
               '2012-05-06 12:12:12'],
              dtype='datetime64[ns]', freq='D')
# normalize=True snaps the timestamps to midnight
>>> pd.date_range('5/2/2016 12:13:14', periods=5, normalize=True)
DatetimeIndex(['2016-05-02', '2016-05-03', '2016-05-04', '2016-05-05',
               '2016-05-06'],
              dtype='datetime64[ns]', freq='D')

# Frequencies and date offsets
>>> from pandas.tseries.offsets import Hour, Minute
>>> hour = Hour
>>> hour
<class 'pandas.tseries.offsets.Hour'>
>>> four_hours = Hour(4)
>>> four_hours
<4 * Hours>
>>> pd.date_range('1/1/2016', '1/2/2016', freq='4h')
DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 04:00:00',
               '2016-01-01 08:00:00', '2016-01-01 12:00:00',
               '2016-01-01 16:00:00', '2016-01-01 20:00:00',
               '2016-01-02 00:00:00'],
              dtype='datetime64[ns]', freq='4H')
>>> pd.date_range('1/1/2000', periods=2, freq='1h30min')
DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00'],
              dtype='datetime64[ns]', freq='90T')

# Frequency aliases
# (see http://pandas.pydata.org/pandas-docs/version/0.18.0/timeseries.html#dateoffset-objects)
# D        calendar day
# B        business day
# H        hour
# T        minute
# S        second
# L        millisecond
# U        microsecond
# M        last calendar day of the month
# BM       last business day of the month
# MS       first calendar day of the month
# BMS      first business day of the month
# W-MON, W-TUE, ...          weekly on the given weekday (WED, THU, FRI, SAT, SUN)
# WOM-1MON, WOM-2MON, ...    e.g. the first or second Monday of each month
# Q-JAN, Q-FEB, ...          quarter end anchored on the given month (JAN ... DEC)
# BQ-JAN, ...                business quarter end anchored on the given month
# AS-JAN, ...                first calendar day of the year in the given month
# BAS-JAN, BAS-FEB, ...      first business day of the year in the given month

>>> rng = pd.date_range('1/1/2016', '9/1/2012', freq='WOM-3FRI')
>>> rng
DatetimeIndex([], dtype='datetime64[ns]', freq='WOM-3FRI')   # empty: the end precedes the start
>>> rng = pd.date_range('1/1/2016', '9/1/2016', freq='WOM-3FRI')
>>> rng
DatetimeIndex(['2016-01-15', '2016-02-19', '2016-03-18', '2016-04-15',
               '2016-05-20', '2016-06-17', '2016-07-15', '2016-08-19'],
              dtype='datetime64[ns]', freq='WOM-3FRI')

# Shifting (leading and lagging) data
>>> ts = Series(np.random.randn(4), index=pd.date_range('1/1/2000', periods=4, freq='M'))
>>> ts
2000-01-31    0.246254
2000-02-29    0.426385
2000-03-31    0.832971
2000-04-30    1.163773
Freq: M, dtype: float64
>>> ts.shift(2)
2000-01-31         NaN
2000-02-29         NaN
2000-03-31    0.246254
2000-04-30    0.426385
Freq: M, dtype: float64
>>> ts.shift(-2)
2000-01-31    0.832971
2000-02-29    1.163773
2000-03-31         NaN
2000-04-30         NaN
Freq: M, dtype: float64
# Percent change via a shift
>>> ts / ts.shift(1) - 1
2000-01-31         NaN
2000-02-29    0.731486
2000-03-31    0.953564
2000-04-30    0.397135
Freq: M, dtype: float64
# Passing freq shifts the timestamps instead of the values
>>> ts.shift(2, freq='M')
2000-03-31    0.246254
2000-04-30    0.426385
2000-05-31    0.832971
2000-06-30    1.163773
Freq: M, dtype: float64
>>> ts.shift(3, freq='D')
2000-02-03    0.246254
2000-03-03    0.426385
2000-04-03    0.832971
2000-05-03    1.163773
dtype: float64
>>> ts.shift(1, freq='3D')
2000-02-03    0.246254
2000-03-03    0.426385
2000-04-03    0.832971
2000-05-03    1.163773
dtype: float64
>>> ts.shift(1, freq='90T')
2000-01-31 01:30:00    0.246254
2000-02-29 01:30:00    0.426385
2000-03-31 01:30:00    0.832971
2000-04-30 01:30:00    1.163773
Freq: M, dtype: float64

# Shifting dates with offsets
>>> from pandas.tseries.offsets import Day, MonthEnd
>>> now = datetime(2011, 11, 17)
>>> now
datetime.datetime(2011, 11, 17, 0, 0)
>>> now + 3 * Day()
Timestamp('2011-11-20 00:00:00')
>>> now + MonthEnd()
Timestamp('2011-11-30 00:00:00')
>>> now + MonthEnd(2)
Timestamp('2011-12-31 00:00:00')
>>> offset = MonthEnd()
>>> offset.rollforward(now)
Timestamp('2011-11-30 00:00:00')
>>> offset.rollback(now)
Timestamp('2011-10-31 00:00:00')

# Anchoring each observation to its month end
>>> ts = Series(np.random.randn(20), index=pd.date_range('1/12/2016', periods=20, freq='4d'))
>>> ts.groupby(offset.rollforward).mean()
2016-01-31   -0.023515
2016-02-29    0.332412
2016-03-31    0.445600
dtype: float64
# resample does the same more conveniently
>>> ts.resample('M', how='mean')
2016-01-31    0.705208
2016-02-29   -0.174444
2016-03-31    0.534282
Freq: M, dtype: float64
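resample('M', how='mean') is the old call style; the how= keyword was later removed in favour of chaining the aggregation. A sketch under that assumption:

import numpy as np
import pandas as pd

ts = pd.Series(np.random.randn(20),
               index=pd.date_range('2016-01-12', periods=20, freq='4D'))
print(ts.resample('M').mean())   # modern spelling of resample('M', how='mean')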
# Period arithmetic
>>> p = pd.Period(2016, freq='A-DEC')
>>> p
Period('2016', 'A-DEC')
>>> p + 5
Period('2021', 'A-DEC')
>>> p - 2
Period('2014', 'A-DEC')
>>> pd.Period('2014', freq='A-DEC') - p
-2
>>> rng = pd.period_range('1/1/2016', '6/30/2016', freq='M')
>>> rng
PeriodIndex(['2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06'],
            dtype='int64', freq='M')
>>> Series(np.random.randn(6), index=rng)
2016-01   -0.739693
2016-02   -0.928667
2016-03    0.176348
2016-04    1.343980
2016-05   -1.513816
2016-06    0.654137
Freq: M, dtype: float64
>>> values = ['2010Q3', '2012Q2', '2013Q1']
>>> index = pd.PeriodIndex(values, freq='Q-DEC')
>>> index
PeriodIndex(['2010Q3', '2012Q2', '2013Q1'], dtype='int64', freq='Q-DEC')

# Period frequency conversion
>>> p = pd.Period('2007', freq='A-DEC')
>>> p.asfreq('M', how='start')
Period('2007-01', 'M')
>>> p.asfreq('M', how='end')
Period('2007-12', 'M')
# A fiscal year ending in February
>>> p = pd.Period('2007', freq='A-FEB')
>>> p.asfreq('M', how='start')
Period('2006-03', 'M')
>>> p.asfreq('M', how='end')
Period('2007-02', 'M')
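Periods and timestamps convert back and forth with to_timestamp and to_period; a short sketch:

import pandas as pd

rng = pd.period_range('2016-01', '2016-06', freq='M')
ts = pd.Series(range(6), index=rng)
print(ts.to_timestamp(how='start').index[:2])      # PeriodIndex -> DatetimeIndex
print(ts.to_timestamp().to_period('M').index[:2])  # and back again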

 

  

Reposted from: https://www.cnblogs.com/similarface/p/5397570.html
