df.duplicated()
→ 각 행이 앞에 나온 행과 중복되는지를 불리언(bool) Series로 반환한다.
모든 컬럼 값이 정확히 일치하는 행만 True가 되며, 기본값(keep='first')에서는 처음 나온 행은 False, 그 뒤에 다시 나온 행만 True로 표시된다.
import pandas as pd
import numpy as np
from pandas import Series
from pandas import DataFrame
# Sample roster for the duplicate-detection demo. The final record repeats the
# first one exactly, so the resulting frame holds one fully duplicated row.
student_list = [
    dict(name=name, major=major, sex=sex)
    for name, major, sex in (
        ('John', "Computer Science", "male"),
        ('Nate', "Computer Science", "male"),
        ('Abraham', "Physics", "male"),
        ('Brian', "Psychology", "male"),
        ('Janny', "Economics", "female"),
        ('Yuna', "Economics", "female"),
        ('Jeniffer', "Computer Science", "female"),
        ('Edward', "Computer Science", "male"),
        ('Zara', "Psychology", "female"),
        ('Wendy', "Economics", "female"),
        ('Sera', "Psychology", "female"),
        ('John', "Computer Science", "male"),
    )
]
# Fix the column order explicitly rather than relying on dict key order.
df = pd.DataFrame(data=student_list, columns=['name', 'major', 'sex'])
# Flag every row that exactly repeats an earlier row across all columns.
df.duplicated(keep='first')
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 True
dtype: bool
df.drop_duplicates()
→ 중복된 행을 제거한 새로운 DataFrame을 반환한다. 원본 df는 변경되지 않으며, 기본값에서는 중복된 행 중 처음 나온 행이 남는다.
# Return a new frame without fully duplicated rows, keeping each first occurrence.
df.drop_duplicates(keep='first')
name major sex
0 John Computer Science male
1 Nate Computer Science male
2 Abraham Physics male
3 Brian Psychology male
4 Janny Economics female
5 Yuna Economics female
6 Jeniffer Computer Science female
7 Edward Computer Science male
8 Zara Psychology female
9 Wendy Economics female
10 Sera Psychology female
행의 모든 값이 같은 경우가 아니라, 선택한 컬럼의 값만 같은 경우를 중복으로 표시하기
# Compare only the 'name' column when deciding whether a row is a duplicate.
df.duplicated(subset=['name'])
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 True
dtype: bool
→ name 컬럼 값이 앞에 나온 행과 같은 행(여기서는 11번 John)을 True로 표시한다. 다른 컬럼의 값은 비교하지 않는다.
# De-duplicate on 'name' alone, retaining the final occurrence of each name.
df.drop_duplicates(subset=['name'], keep='last')
name major sex
1 Nate Computer Science male
2 Abraham Physics male
3 Brian Psychology male
4 Janny Economics female
5 Yuna Economics female
6 Jeniffer Computer Science female
7 Edward Computer Science male
8 Zara Psychology female
9 Wendy Economics female
10 Sera Psychology female
11 John Computer Science male
keep을 지정하지 않으면 기본값인 'first'가 적용되어 중복된 행 중 처음 나온 행이 남는다. 위 예시처럼 keep='last'를 지정하면 마지막에 나온 행(11번)이 남고 처음 나온 행(0번)이 제거된다.