python-pandas基礎操作

簡單地記錄一下用 pandas 載入數據以及查看數據的方法

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
#data source:http://archive.ics.uci.edu/ml/datasets/Iris
import urllib3
##from bs4 import BeautifulSoup
import pandas as pd
import io
# Download the raw Iris CSV and load it into a DataFrame named `iris`.
http = urllib3.PoolManager()
# NOTE: despite its name, `url` holds the HTTP response object (not a URL
# string). The name is kept because later snippets read `url.data`.
url = http.request('GET', 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data')
# Fail fast on a bad response so an error page is never parsed as CSV data.
if url.status != 200:
    raise RuntimeError("Failed to download iris.data: HTTP {}".format(url.status))

# This file has no header row, so the column names are supplied by hand and
# header=None tells read_csv not to treat the first line as names.
# If a file DOES start with a header row, drop both `header=None` and `names`:
# the default (header=0) takes the column names from the first line.
# (parse_dates=[0] is unrelated to headers: it parses column 0 as dates.)
column_name = [
    'sepal_length_in_cm',
    'sepal_width_in_cm',
    'petal_length_in_cm',
    'petal_width_in_cm',
    'class',
]
# Decode the response body to text and parse it as CSV.
iris = pd.read_csv(io.StringIO(url.data.decode("utf-8")), header=None, names=column_name)
1
2
# First rows of the data (n sets how many rows to show; the default is 5).
iris.head(n=3)
   sepal_length_in_cm  sepal_width_in_cm  petal_length_in_cm  petal_width_in_cm        class
0                 5.1                3.5                 1.4                0.2  Iris-setosa
1                 4.9                3.0                 1.4                0.2  Iris-setosa
2                 4.7                3.2                 1.3                0.2  Iris-setosa
1
2
# Last rows of the data (n sets how many rows to show; the default is 5).
iris.tail(n=3)
     sepal_length_in_cm  sepal_width_in_cm  petal_length_in_cm  petal_width_in_cm           class
147                 6.5                3.0                 5.2                2.0  Iris-virginica
148                 6.2                3.4                 5.4                2.3  Iris-virginica
149                 5.9                3.0                 5.1                1.8  Iris-virginica
1
2
# Summary statistics of the data (count, mean, std, min, quartiles, max).
iris.describe()
       sepal_length_in_cm  sepal_width_in_cm  petal_length_in_cm  petal_width_in_cm
count          150.000000         150.000000          150.000000         150.000000
mean             5.843333           3.054000            3.758667           1.198667
std              0.828066           0.433594            1.764420           0.763161
min              4.300000           2.000000            1.000000           0.100000
25%              5.100000           2.800000            1.600000           0.300000
50%              5.800000           3.000000            4.350000           1.300000
75%              6.400000           3.300000            5.100000           1.800000
max              7.900000           4.400000            6.900000           2.500000
1
2
# Select a single column by its name, then show its first rows.
iris['sepal_length_in_cm'].head()
0    5.1
1    4.9
2    4.7
3    4.6
4    5.0
Name: sepal_length_in_cm, dtype: float64
1
2
# Select several columns at once by passing a list of column names.
iris[['sepal_length_in_cm',"class"]].head()
   sepal_length_in_cm        class
0                 5.1  Iris-setosa
1                 4.9  Iris-setosa
2                 4.7  Iris-setosa
3                 4.6  Iris-setosa
4                 5.0  Iris-setosa
1
2
3
4
5
6
7
8
9
10
11
# When a dataset is too large to load in one go, read_csv can hand it back
# piece by piece. Each piece ("chunk") is a block of rows from the file.
chunks = pd.read_csv(
    io.StringIO(url.data.decode("utf-8")),
    header=None,
    names=column_name,
    chunksize=10,
)
# Looping over the reader would print every chunk in turn:
# for chunk in chunks:
#     print(chunk)
# With chunksize=10 the rows come in groups of 10.
# Print just one of those groups here.

print(chunks.get_chunk(10))
   sepal_length_in_cm  sepal_width_in_cm  petal_length_in_cm  \
0                 5.1                3.5                 1.4   
1                 4.9                3.0                 1.4   
2                 4.7                3.2                 1.3   
3                 4.6                3.1                 1.5   
4                 5.0                3.6                 1.4   
5                 5.4                3.9                 1.7   
6                 4.6                3.4                 1.4   
7                 5.0                3.4                 1.5   
8                 4.4                2.9                 1.4   
9                 4.9                3.1                 1.5   

   petal_width_in_cm        class  
0                0.2  Iris-setosa  
1                0.2  Iris-setosa  
2                0.2  Iris-setosa  
3                0.2  Iris-setosa  
4                0.2  Iris-setosa  
5                0.4  Iris-setosa  
6                0.3  Iris-setosa  
7                0.2  Iris-setosa  
8                0.2  Iris-setosa  
9                0.1  Iris-setosa  
1
2
3
4
# For a really large dataset, ask read_csv for an iterator instead of fixing
# the chunk size up front: with iterator=True the number of rows in each piece
# is chosen per call, through the argument passed to get_chunk().
iris_iterator = pd.read_csv(
    io.StringIO(url.data.decode("utf-8")),
    header=None,
    names=column_name,
    iterator=True,
)
# Fetch the first 10 rows; a later call may request a different number.
iris_iterator.get_chunk(10)
   sepal_length_in_cm  sepal_width_in_cm  petal_length_in_cm  petal_width_in_cm        class
0                 5.1                3.5                 1.4                0.2  Iris-setosa
1                 4.9                3.0                 1.4                0.2  Iris-setosa
2                 4.7                3.2                 1.3                0.2  Iris-setosa
3                 4.6                3.1                 1.5                0.2  Iris-setosa
4                 5.0                3.6                 1.4                0.2  Iris-setosa
5                 5.4                3.9                 1.7                0.4  Iris-setosa
6                 4.6                3.4                 1.4                0.3  Iris-setosa
7                 5.0                3.4                 1.5                0.2  Iris-setosa
8                 4.4                2.9                 1.4                0.2  Iris-setosa
9                 4.9                3.1                 1.5                0.1  Iris-setosa

本文地址: https://david6686.github.io/blog/silentink/10255/

加個打賞功能,說不定真的有好心人~