快速查看整体信息
import numpy as np
import pandas as pd
1
2
2
# .info()
这是 DataFrame 才可用的 API,快捷查看多种信息:总行数和列数、每列元素类型和 non-NaN 的个数,总内存。
DataFrame.info(verbose=None, memory_usage=True, null_counts=True)
- verbose:True or False,字面意思是“冗长的”,也就是说,如果 DataFrame 有很多列,是否显示所有列的信息;如果为 False,则会省略一部分;
- memory_usage:True or False,默认为 True,是否查看 DataFrame 的内存使用情况;
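- null_counts:True or False,默认为 True,是否显示每列非空(non-null)值的个数,即输出中的 Non-Null Count 一列。注意:该参数自 pandas 1.2 起更名为 show_counts,null_counts 已在 pandas 2.0 中移除,新版本请使用 show_counts。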
df = pd.DataFrame( columns = range(0,100))
df
1
2
2
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 |
---|
0 rows × 100 columns
df.info() # 直接默认设置即可
1
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 100 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 0 0 non-null object
1 1 0 non-null object
2 2 0 non-null object
3 3 0 non-null object
4 4 0 non-null object
5 5 0 non-null object
6 6 0 non-null object
7 7 0 non-null object
8 8 0 non-null object
9 9 0 non-null object
10 10 0 non-null object
11 11 0 non-null object
12 12 0 non-null object
13 13 0 non-null object
14 14 0 non-null object
15 15 0 non-null object
16 16 0 non-null object
17 17 0 non-null object
18 18 0 non-null object
19 19 0 non-null object
20 20 0 non-null object
21 21 0 non-null object
22 22 0 non-null object
23 23 0 non-null object
24 24 0 non-null object
25 25 0 non-null object
26 26 0 non-null object
27 27 0 non-null object
28 28 0 non-null object
29 29 0 non-null object
30 30 0 non-null object
31 31 0 non-null object
32 32 0 non-null object
33 33 0 non-null object
34 34 0 non-null object
35 35 0 non-null object
36 36 0 non-null object
37 37 0 non-null object
38 38 0 non-null object
39 39 0 non-null object
40 40 0 non-null object
41 41 0 non-null object
42 42 0 non-null object
43 43 0 non-null object
44 44 0 non-null object
45 45 0 non-null object
46 46 0 non-null object
47 47 0 non-null object
48 48 0 non-null object
49 49 0 non-null object
50 50 0 non-null object
51 51 0 non-null object
52 52 0 non-null object
53 53 0 non-null object
54 54 0 non-null object
55 55 0 non-null object
56 56 0 non-null object
57 57 0 non-null object
58 58 0 non-null object
59 59 0 non-null object
60 60 0 non-null object
61 61 0 non-null object
62 62 0 non-null object
63 63 0 non-null object
64 64 0 non-null object
65 65 0 non-null object
66 66 0 non-null object
67 67 0 non-null object
68 68 0 non-null object
69 69 0 non-null object
70 70 0 non-null object
71 71 0 non-null object
72 72 0 non-null object
73 73 0 non-null object
74 74 0 non-null object
75 75 0 non-null object
76 76 0 non-null object
77 77 0 non-null object
78 78 0 non-null object
79 79 0 non-null object
80 80 0 non-null object
81 81 0 non-null object
82 82 0 non-null object
83 83 0 non-null object
84 84 0 non-null object
85 85 0 non-null object
86 86 0 non-null object
87 87 0 non-null object
88 88 0 non-null object
89 89 0 non-null object
90 90 0 non-null object
91 91 0 non-null object
92 92 0 non-null object
93 93 0 non-null object
94 94 0 non-null object
95 95 0 non-null object
96 96 0 non-null object
97 97 0 non-null object
98 98 0 non-null object
99 99 0 non-null object
dtypes: object(100)
memory usage: 0.0+ bytes
# .ndim, .shape, .size
查看维数,形状,元素个数。
df = pd.DataFrame( [[np.nan, 2],[3,np.nan]], columns = ['A','B'])
df
1
2
2
| | A | B |
---|---|---|
0 | NaN | 2.0 |
1 | 3.0 | NaN |
df.ndim # 返回维度数,Series 是一维,DataFrame 是二维,平时很少用到,不过有时会在循环中用到
1
2
df.shape # (行数,列数)
1
(2, 2)
df.size # 元素个数,rows×cols
1
4
# .head(), .tail()
默认分别查看头 5 行和后 5 行。
Series/DataFrame.head(n=5)
Series/DataFrame.tail(n=5)
s = pd.Series( range(0, 20))
s
1
2
2
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
11 11
12 12
13 13
14 14
15 15
16 16
17 17
18 18
19 19
dtype: int64
s.head(3)
1
0 0
1 1
2 2
dtype: int64
s.tail(3)
1
17 17
18 18
19 19
dtype: int64
# .memory_usage()
比 info 中内存显示更可控一些,单位是字节。
Series/DataFrame.memory_usage(index=True, deep=False)
- index:是否显示索引占用的内存,毫无疑问索引也占用内存;
- deep:是否深入统计 object 类型列的真实内存占用。pandas 中 object 列的每个元素只是一个引用,deep=False 时只计算这些引用本身的大小;deep=True 时会进一步统计被引用对象(例如字符串)实际占用的内存,并计入返回值。
df.memory_usage(deep=False) # Index即索引占用内存
1
Index 128
A 16
B 16
dtype: int64
df.memory_usage(deep=True) # 含 object 型的列时占用会变大;本例的 df 只有数值列(没有 object 列),所以结果与 deep=False 相同
1
Index 128
A 16
B 16
dtype: int64
# .describe()
快速查看每一列的统计信息,默认排除所有 NaN 元素。
DataFrame.describe( include= [np.number])
- include:'all' 或者类型列表,例如 [np.number] 或 [object]。np.number 只对数值类型的列做数值统计,object 只对 object 类型的列做类别(字符串)统计。注意:np.object 已在 NumPy 1.24 中移除,新版本请直接使用 Python 内置的 object。
df = pd.DataFrame( [[1,'a'],[2,'b'],[1,'b']], columns = ['numeric','object'])
df
1
2
2
| | numeric | object |
---|---|---|
0 | 1 | a |
1 | 2 | b |
2 | 1 | b |
df.dtypes
1
numeric int64
object object
dtype: object
df.describe() # 默认只对数值列进行统计
1
| | numeric |
---|---|
count | 3.000000 |
mean | 1.333333 |
std | 0.577350 |
min | 1.000000 |
25% | 1.000000 |
50% | 1.000000 |
75% | 1.500000 |
max | 2.000000 |
df.describe( include=[np.object]) # 只对object型列进行统计,类别统计方式,只统计这四种
1
| | object |
---|---|
count | 3 |
unique | 2 |
top | b |
freq | 2 |
df.describe( include = 'all') # 数值序列和object序列共同统计的信息只有count: non-NaN元素个数
1
| | numeric | object |
---|---|---|
count | 3.000000 | 3 |
unique | NaN | 2 |
top | NaN | b |
freq | NaN | 2 |
mean | 1.333333 | NaN |
std | 0.577350 | NaN |
min | 1.000000 | NaN |
25% | 1.000000 | NaN |
50% | 1.000000 | NaN |
75% | 1.500000 | NaN |
max | 2.000000 | NaN |
上次更新: 2023/11/01, 03:11:44