import pandas as pd
import numpy as np
from string_grouper import StringGrouper
# Load only the first 50,000 rows.  ``nrows`` stops the parser there instead
# of reading the entire file and then discarding the remainder with a slice.
companies_df = pd.read_csv('data/sec__edgar_company_info.csv', nrows=50000)

# Strings to be matched, and the identifier column that accompanies them.
master = companies_df['Company Name']
master_id = companies_df['Line Number']

# Two query strings and the identifiers that accompany them.
duplicates = pd.Series(["ADVISORS DISCIPLINED TRUST", "ADVISORS DISCIPLINED TRUST '18"])
duplicates_id = pd.Series([3, 5])
By default, zero-similarity matches are found and output when `min_similarity = 0`:
# Match the master names against both query strings with a similarity floor
# of 0, so no pair is filtered out because of a low score.
# NOTE: inside the character class, `,-.` is a range that spans exactly
# ',', '-' and '.', so the pattern matches the characters , - . / and #.
grouper_kwargs = dict(
    master=master,
    duplicates=duplicates,
    master_id=master_id,
    duplicates_id=duplicates_id,
    ignore_index=True,
    min_similarity=0,
    max_n_matches=10000,
    regex="[,-./#]",
)
string_grouper = StringGrouper(**grouper_kwargs).fit()
string_grouper.get_matches()
| | left_Company Name | left_Line Number | similarity | right_id | right_side |
---|---|---|---|---|---|
0 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 3 | 0.091157 | 3 | ADVISORS DISCIPLINED TRUST |
1 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 3 | 0.063861 | 5 | ADVISORS DISCIPLINED TRUST '18 |
2 | 05 CAT THIEF/GOLD IN MY STARS LLC | 21 | 0.015313 | 3 | ADVISORS DISCIPLINED TRUST |
3 | 05 CAT THIEF/GOLD IN MY STARS LLC | 21 | 0.010728 | 5 | ADVISORS DISCIPLINED TRUST '18 |
4 | 05 DIXIE UNION/UNDER FIRE LLC | 22 | 0.025397 | 3 | ADVISORS DISCIPLINED TRUST |
... | ... | ... | ... | ... | ... |
99995 | ALLDREDGE WILLIAM T | 21746 | 0.000000 | 3 | ADVISORS DISCIPLINED TRUST |
99996 | ALLEN SAMUEL R | 22183 | 0.000000 | 5 | ADVISORS DISCIPLINED TRUST '18 |
99997 | ATSP INNOVATIONS, LLC | 45273 | 0.000000 | 5 | ADVISORS DISCIPLINED TRUST '18 |
99998 | ATLAS IDF, LP | 44877 | 0.000000 | 5 | ADVISORS DISCIPLINED TRUST '18 |
99999 | AU LEO Y | 45535 | 0.000000 | 5 | ADVISORS DISCIPLINED TRUST '18 |
100000 rows × 5 columns
# Rebuild the grouper with include_zeroes=False; the similarity floor stays
# at 0, but zero-similarity pairs are to be left out of the match output.
no_zero_options = {
    "master": master,
    "duplicates": duplicates,
    "master_id": master_id,
    "duplicates_id": duplicates_id,
    "ignore_index": True,
    "min_similarity": 0,
    "max_n_matches": 10000,
    "regex": "[,-./#]",
    "include_zeroes": False,
}
string_grouper = StringGrouper(**no_zero_options).fit()
string_grouper.get_matches()
| | left_Company Name | left_Line Number | similarity | right_id | right_side |
---|---|---|---|---|---|
0 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 3 | 0.091157 | 3 | ADVISORS DISCIPLINED TRUST |
1 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 3 | 0.063861 | 5 | ADVISORS DISCIPLINED TRUST '18 |
2 | 05 CAT THIEF/GOLD IN MY STARS LLC | 21 | 0.015313 | 3 | ADVISORS DISCIPLINED TRUST |
3 | 05 CAT THIEF/GOLD IN MY STARS LLC | 21 | 0.010728 | 5 | ADVISORS DISCIPLINED TRUST '18 |
4 | 05 DIXIE UNION/UNDER FIRE LLC | 22 | 0.025397 | 3 | ADVISORS DISCIPLINED TRUST |
... | ... | ... | ... | ... | ... |
28754 | BAAPLIFE3-2015, LLC | 49976 | 0.021830 | 5 | ADVISORS DISCIPLINED TRUST '18 |
28755 | BAAPLIFE4-2016, LLC | 49977 | 0.030983 | 3 | ADVISORS DISCIPLINED TRUST |
28756 | BAAPLIFE4-2016, LLC | 49977 | 0.021706 | 5 | ADVISORS DISCIPLINED TRUST '18 |
28757 | BABA JOE DIAMOND VENTURES US INC. | 49989 | 0.027064 | 3 | ADVISORS DISCIPLINED TRUST |
28758 | BABA JOE DIAMOND VENTURES US INC. | 49989 | 0.018960 | 5 | ADVISORS DISCIPLINED TRUST '18 |
28759 rows × 5 columns
# Per-call override: request zero-similarity pairs for this call even though
# the grouper itself was constructed with include_zeroes=False.
matches_with_zeroes = string_grouper.get_matches(include_zeroes=True)
matches_with_zeroes
| | left_Company Name | left_Line Number | similarity | right_id | right_side |
---|---|---|---|---|---|
0 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 3 | 0.091157 | 3 | ADVISORS DISCIPLINED TRUST |
1 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 3 | 0.063861 | 5 | ADVISORS DISCIPLINED TRUST '18 |
2 | 05 CAT THIEF/GOLD IN MY STARS LLC | 21 | 0.015313 | 3 | ADVISORS DISCIPLINED TRUST |
3 | 05 CAT THIEF/GOLD IN MY STARS LLC | 21 | 0.010728 | 5 | ADVISORS DISCIPLINED TRUST '18 |
4 | 05 DIXIE UNION/UNDER FIRE LLC | 22 | 0.025397 | 3 | ADVISORS DISCIPLINED TRUST |
... | ... | ... | ... | ... | ... |
99995 | ALLDREDGE WILLIAM T | 21746 | 0.000000 | 3 | ADVISORS DISCIPLINED TRUST |
99996 | ALLEN SAMUEL R | 22183 | 0.000000 | 5 | ADVISORS DISCIPLINED TRUST '18 |
99997 | ATSP INNOVATIONS, LLC | 45273 | 0.000000 | 5 | ADVISORS DISCIPLINED TRUST '18 |
99998 | ATLAS IDF, LP | 44877 | 0.000000 | 5 | ADVISORS DISCIPLINED TRUST '18 |
99999 | AU LEO Y | 45535 | 0.000000 | 5 | ADVISORS DISCIPLINED TRUST '18 |
100000 rows × 5 columns
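When `master_id` and `duplicates_id` are not supplied, the default indexes of `master` and `duplicates` are output (as `left_index` and `right_index`):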
# No ID Series and no ignore_index here: only the two string Series are
# handed to the grouper, so the output is keyed by their own indexes.
index_keyed_kwargs = dict(
    master=master,
    duplicates=duplicates,
    min_similarity=0,
    max_n_matches=10000,
    regex="[,-./#]",
)
string_grouper = StringGrouper(**index_keyed_kwargs).fit()
string_grouper.get_matches()
| | left_index | left_Company Name | similarity | right_side | right_index |
---|---|---|---|---|---|
0 | 2 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 0.091157 | ADVISORS DISCIPLINED TRUST | 0 |
1 | 2 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 0.063861 | ADVISORS DISCIPLINED TRUST '18 | 1 |
2 | 20 | 05 CAT THIEF/GOLD IN MY STARS LLC | 0.015313 | ADVISORS DISCIPLINED TRUST | 0 |
3 | 20 | 05 CAT THIEF/GOLD IN MY STARS LLC | 0.010728 | ADVISORS DISCIPLINED TRUST '18 | 1 |
4 | 21 | 05 DIXIE UNION/UNDER FIRE LLC | 0.025397 | ADVISORS DISCIPLINED TRUST | 0 |
... | ... | ... | ... | ... | ... |
99995 | 21745 | ALLDREDGE WILLIAM T | 0.000000 | ADVISORS DISCIPLINED TRUST | 0 |
99996 | 22182 | ALLEN SAMUEL R | 0.000000 | ADVISORS DISCIPLINED TRUST '18 | 1 |
99997 | 45272 | ATSP INNOVATIONS, LLC | 0.000000 | ADVISORS DISCIPLINED TRUST '18 | 1 |
99998 | 44876 | ATLAS IDF, LP | 0.000000 | ADVISORS DISCIPLINED TRUST '18 | 1 |
99999 | 45534 | AU LEO Y | 0.000000 | ADVISORS DISCIPLINED TRUST '18 | 1 |
100000 rows × 5 columns
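When `master` and `duplicates` carry their own indexes, those index values are output instead (a named index lends its name to the column, e.g. `left_Line Number`):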
# Replace the indexes of both Series (in place) with the ID values, so the
# identifiers travel with the strings instead of being passed separately.
duplicates.index = pd.Index(duplicates_id)
master.index = pd.Index(master_id)

custom_index_kwargs = dict(
    master=master,
    duplicates=duplicates,
    min_similarity=0,
    max_n_matches=10000,
    regex="[,-./#]",
)
string_grouper = StringGrouper(**custom_index_kwargs).fit()
string_grouper.get_matches()
| | left_Line Number | left_Company Name | similarity | right_side | right_index |
---|---|---|---|---|---|
0 | 3 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 0.091157 | ADVISORS DISCIPLINED TRUST | 3 |
1 | 3 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 0.063861 | ADVISORS DISCIPLINED TRUST '18 | 5 |
2 | 21 | 05 CAT THIEF/GOLD IN MY STARS LLC | 0.015313 | ADVISORS DISCIPLINED TRUST | 3 |
3 | 21 | 05 CAT THIEF/GOLD IN MY STARS LLC | 0.010728 | ADVISORS DISCIPLINED TRUST '18 | 5 |
4 | 22 | 05 DIXIE UNION/UNDER FIRE LLC | 0.025397 | ADVISORS DISCIPLINED TRUST | 3 |
... | ... | ... | ... | ... | ... |
99995 | 21746 | ALLDREDGE WILLIAM T | 0.000000 | ADVISORS DISCIPLINED TRUST | 3 |
99996 | 22183 | ALLEN SAMUEL R | 0.000000 | ADVISORS DISCIPLINED TRUST '18 | 5 |
99997 | 45273 | ATSP INNOVATIONS, LLC | 0.000000 | ADVISORS DISCIPLINED TRUST '18 | 5 |
99998 | 44877 | ATLAS IDF, LP | 0.000000 | ADVISORS DISCIPLINED TRUST '18 | 5 |
99999 | 45535 | AU LEO Y | 0.000000 | ADVISORS DISCIPLINED TRUST '18 | 5 |
100000 rows × 5 columns