In [1]:
import pandas as pd
In [2]:
person = pd.read_csv('../../data/survey_person.csv') # 관측한 사람
site = pd.read_csv('../../data/survey_site.csv') # 관측 위치
survey = pd.read_csv('../../data/survey_survey.csv') # 관측 날짜
visited = pd.read_csv('../../data/survey_visited.csv') # 날씨 정보
person
Out[2]:
ident | personal | family | |
---|---|---|---|
0 | dyer | William | Dyer |
1 | pb | Frank | Pabodie |
2 | lake | Anderson | Lake |
3 | roe | Valentina | Roerich |
4 | danforth | Frank | Danforth |
In [3]:
survey
Out[3]:
taken | person | quant | reading | |
---|---|---|---|---|
0 | 619 | dyer | rad | 9.82 |
1 | 619 | dyer | sal | 0.13 |
2 | 622 | dyer | rad | 7.80 |
3 | 622 | dyer | sal | 0.09 |
4 | 734 | pb | rad | 8.41 |
5 | 734 | lake | sal | 0.05 |
6 | 734 | pb | temp | -21.50 |
7 | 735 | pb | rad | 7.22 |
8 | 735 | NaN | sal | 0.06 |
9 | 735 | NaN | temp | -26.00 |
10 | 751 | pb | rad | 4.35 |
11 | 751 | pb | temp | -18.50 |
12 | 751 | lake | sal | 0.10 |
13 | 752 | lake | rad | 2.19 |
14 | 752 | lake | sal | 0.09 |
15 | 752 | lake | temp | -16.00 |
16 | 752 | roe | sal | 41.60 |
17 | 837 | lake | rad | 1.46 |
18 | 837 | lake | sal | 0.21 |
19 | 837 | roe | sal | 22.50 |
20 | 844 | roe | rad | 11.25 |
In [4]:
site
Out[4]:
name | lat | long | |
---|---|---|---|
0 | DR-1 | -49.85 | -128.57 |
1 | DR-3 | -47.15 | -126.72 |
2 | MSK-4 | -48.87 | -123.40 |
In [5]:
visited
Out[5]:
ident | site | dated | |
---|---|---|---|
0 | 619 | DR-1 | 1927-02-08 |
1 | 622 | DR-1 | 1927-02-10 |
2 | 734 | DR-3 | 1939-01-07 |
3 | 735 | DR-3 | 1930-01-12 |
4 | 751 | DR-3 | 1930-02-26 |
5 | 752 | DR-3 | NaN |
6 | 837 | MSK-4 | 1932-01-14 |
7 | 844 | DR-1 | 1932-03-22 |
In [6]:
# 0,2,6 번째 행의 데이터
visited_subset = visited.loc[[0,2,6], ]
print(visited_subset)
print(site)
ident site dated
0 619 DR-1 1927-02-08
2 734 DR-3 1939-01-07
6 837 MSK-4 1932-01-14
name lat long
0 DR-1 -49.85 -128.57
1 DR-3 -47.15 -126.72
2 MSK-4 -48.87 -123.40
In [7]:
# merge : 기본적으로 내부 조인을 실행하며 메서드를 사용한
# 데이터 프레임(site)을 왼쪽으로 지정, 첫 번째 인자값으로
# 지정한 데이터 프레임(visited_subset)을 오른쪽으로 지정
# 열 이름 일치 - 왼쪽 프레임을 기준으로 연결
o2o_merge = site.merge(visited_subset, left_on='name', \
right_on='site')
print(o2o_merge)
name lat long ident site dated
0 DR-1 -49.85 -128.57 619 DR-1 1927-02-08
1 DR-3 -47.15 -126.72 734 DR-3 1939-01-07
2 MSK-4 -48.87 -123.40 837 MSK-4 1932-01-14
In [8]:
m2o_merge = site.merge(visited, left_on='name', right_on='site')
print(m2o_merge)
name lat long ident site dated
0 DR-1 -49.85 -128.57 619 DR-1 1927-02-08
1 DR-1 -49.85 -128.57 622 DR-1 1927-02-10
2 DR-1 -49.85 -128.57 844 DR-1 1932-03-22
3 DR-3 -47.15 -126.72 734 DR-3 1939-01-07
4 DR-3 -47.15 -126.72 735 DR-3 1930-01-12
5 DR-3 -47.15 -126.72 751 DR-3 1930-02-26
6 DR-3 -47.15 -126.72 752 DR-3 NaN
7 MSK-4 -48.87 -123.40 837 MSK-4 1932-01-14
In [9]:
ps = person.merge(survey, left_on='ident', right_on='person')
vs = visited.merge(survey, left_on='ident', right_on='taken')
ps
Out[9]:
ident | personal | family | taken | person | quant | reading | |
---|---|---|---|---|---|---|---|
0 | dyer | William | Dyer | 619 | dyer | rad | 9.82 |
1 | dyer | William | Dyer | 619 | dyer | sal | 0.13 |
2 | dyer | William | Dyer | 622 | dyer | rad | 7.80 |
3 | dyer | William | Dyer | 622 | dyer | sal | 0.09 |
4 | pb | Frank | Pabodie | 734 | pb | rad | 8.41 |
5 | pb | Frank | Pabodie | 734 | pb | temp | -21.50 |
6 | pb | Frank | Pabodie | 735 | pb | rad | 7.22 |
7 | pb | Frank | Pabodie | 751 | pb | rad | 4.35 |
8 | pb | Frank | Pabodie | 751 | pb | temp | -18.50 |
9 | lake | Anderson | Lake | 734 | lake | sal | 0.05 |
10 | lake | Anderson | Lake | 751 | lake | sal | 0.10 |
11 | lake | Anderson | Lake | 752 | lake | rad | 2.19 |
12 | lake | Anderson | Lake | 752 | lake | sal | 0.09 |
13 | lake | Anderson | Lake | 752 | lake | temp | -16.00 |
14 | lake | Anderson | Lake | 837 | lake | rad | 1.46 |
15 | lake | Anderson | Lake | 837 | lake | sal | 0.21 |
16 | roe | Valentina | Roerich | 752 | roe | sal | 41.60 |
17 | roe | Valentina | Roerich | 837 | roe | sal | 22.50 |
18 | roe | Valentina | Roerich | 844 | roe | rad | 11.25 |
In [10]:
vs
Out[10]:
ident | site | dated | taken | person | quant | reading | |
---|---|---|---|---|---|---|---|
0 | 619 | DR-1 | 1927-02-08 | 619 | dyer | rad | 9.82 |
1 | 619 | DR-1 | 1927-02-08 | 619 | dyer | sal | 0.13 |
2 | 622 | DR-1 | 1927-02-10 | 622 | dyer | rad | 7.80 |
3 | 622 | DR-1 | 1927-02-10 | 622 | dyer | sal | 0.09 |
4 | 734 | DR-3 | 1939-01-07 | 734 | pb | rad | 8.41 |
5 | 734 | DR-3 | 1939-01-07 | 734 | lake | sal | 0.05 |
6 | 734 | DR-3 | 1939-01-07 | 734 | pb | temp | -21.50 |
7 | 735 | DR-3 | 1930-01-12 | 735 | pb | rad | 7.22 |
8 | 735 | DR-3 | 1930-01-12 | 735 | NaN | sal | 0.06 |
9 | 735 | DR-3 | 1930-01-12 | 735 | NaN | temp | -26.00 |
10 | 751 | DR-3 | 1930-02-26 | 751 | pb | rad | 4.35 |
11 | 751 | DR-3 | 1930-02-26 | 751 | pb | temp | -18.50 |
12 | 751 | DR-3 | 1930-02-26 | 751 | lake | sal | 0.10 |
13 | 752 | DR-3 | NaN | 752 | lake | rad | 2.19 |
14 | 752 | DR-3 | NaN | 752 | lake | sal | 0.09 |
15 | 752 | DR-3 | NaN | 752 | lake | temp | -16.00 |
16 | 752 | DR-3 | NaN | 752 | roe | sal | 41.60 |
17 | 837 | MSK-4 | 1932-01-14 | 837 | lake | rad | 1.46 |
18 | 837 | MSK-4 | 1932-01-14 | 837 | lake | sal | 0.21 |
19 | 837 | MSK-4 | 1932-01-14 | 837 | roe | sal | 22.50 |
20 | 844 | DR-1 | 1932-03-22 | 844 | roe | rad | 11.25 |
In [11]:
# ps 데이터 프레임의 ident, taken, quant, reading 열의 값과 vs 데이터
ps_vs = ps.merge(vs, left_on = \
['ident', 'taken','quant', 'reading'],
right_on =['person', 'ident', 'quant', 'reading'])
ps_vs
# 중복된 열 이름(ident, taken, person)에 접미사 _x(왼쪽), _y(오른쪽)가 추가됨
Out[11]:
ident_x | personal | family | taken_x | person_x | quant | reading | ident_y | site | dated | taken_y | person_y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | dyer | William | Dyer | 619 | dyer | rad | 9.82 | 619 | DR-1 | 1927-02-08 | 619 | dyer |
1 | dyer | William | Dyer | 619 | dyer | sal | 0.13 | 619 | DR-1 | 1927-02-08 | 619 | dyer |
2 | dyer | William | Dyer | 622 | dyer | rad | 7.80 | 622 | DR-1 | 1927-02-10 | 622 | dyer |
3 | dyer | William | Dyer | 622 | dyer | sal | 0.09 | 622 | DR-1 | 1927-02-10 | 622 | dyer |
4 | pb | Frank | Pabodie | 734 | pb | rad | 8.41 | 734 | DR-3 | 1939-01-07 | 734 | pb |
5 | pb | Frank | Pabodie | 734 | pb | temp | -21.50 | 734 | DR-3 | 1939-01-07 | 734 | pb |
6 | pb | Frank | Pabodie | 735 | pb | rad | 7.22 | 735 | DR-3 | 1930-01-12 | 735 | pb |
7 | pb | Frank | Pabodie | 751 | pb | rad | 4.35 | 751 | DR-3 | 1930-02-26 | 751 | pb |
8 | pb | Frank | Pabodie | 751 | pb | temp | -18.50 | 751 | DR-3 | 1930-02-26 | 751 | pb |
9 | lake | Anderson | Lake | 734 | lake | sal | 0.05 | 734 | DR-3 | 1939-01-07 | 734 | lake |
10 | lake | Anderson | Lake | 751 | lake | sal | 0.10 | 751 | DR-3 | 1930-02-26 | 751 | lake |
11 | lake | Anderson | Lake | 752 | lake | rad | 2.19 | 752 | DR-3 | NaN | 752 | lake |
12 | lake | Anderson | Lake | 752 | lake | sal | 0.09 | 752 | DR-3 | NaN | 752 | lake |
13 | lake | Anderson | Lake | 752 | lake | temp | -16.00 | 752 | DR-3 | NaN | 752 | lake |
14 | lake | Anderson | Lake | 837 | lake | rad | 1.46 | 837 | MSK-4 | 1932-01-14 | 837 | lake |
15 | lake | Anderson | Lake | 837 | lake | sal | 0.21 | 837 | MSK-4 | 1932-01-14 | 837 | lake |
16 | roe | Valentina | Roerich | 752 | roe | sal | 41.60 | 752 | DR-3 | NaN | 752 | roe |
17 | roe | Valentina | Roerich | 837 | roe | sal | 22.50 | 837 | MSK-4 | 1932-01-14 | 837 | roe |
18 | roe | Valentina | Roerich | 844 | roe | rad | 11.25 | 844 | DR-1 | 1932-03-22 | 844 | roe |
반응형
'데이터분석' 카테고리의 다른 글
[23.06.16] Python melt - 11(1) (0) | 2023.06.16 |
---|---|
[23.06.15] Python 누락 값 처리 - 10(3) (0) | 2023.06.15 |
[23.06.15] Python concat - 10(1) (0) | 2023.06.15 |
[23.06.14] Python Seaborn - 09(2) (0) | 2023.06.14 |
[23.06.14] Python Seaborn - 09(1) (0) | 2023.06.14 |