Skip to content

Latest commit

 

History

History
2361 lines (2035 loc) · 43.4 KB

ignore_index_and_replace_na.md

File metadata and controls

2361 lines (2035 loc) · 43.4 KB
import pandas as pd
from string_grouper import match_strings, match_most_similar, group_similar_strings

1. match_strings(..., ignore_index=[True | False])

test_series_1_nameless = pd.Series(['foo', 'bar', 'baz', 'foo'])
test_series_1_nameless
0    foo
1    bar
2    baz
3    foo
dtype: object
match_strings(test_series_1_nameless)
left_index left_side similarity right_side right_index
0 0 foo 1.0 foo 0
1 0 foo 1.0 foo 3
2 1 bar 1.0 bar 1
3 2 baz 1.0 baz 2
4 3 foo 1.0 foo 0
5 3 foo 1.0 foo 3
match_strings(test_series_1_nameless, ignore_index=True)
left_side similarity right_side
0 foo 1.0 foo
1 foo 1.0 foo
2 bar 1.0 bar
3 baz 1.0 baz
4 foo 1.0 foo
5 foo 1.0 foo
test_series_1_named = pd.Series(['foo', 'bar', 'baz', 'foo'], name='wow')
test_series_1_named
0    foo
1    bar
2    baz
3    foo
Name: wow, dtype: object
match_strings(test_series_1_named)
left_index left_wow similarity right_wow right_index
0 0 foo 1.0 foo 0
1 0 foo 1.0 foo 3
2 1 bar 1.0 bar 1
3 2 baz 1.0 baz 2
4 3 foo 1.0 foo 0
5 3 foo 1.0 foo 3
match_strings(test_series_1_named, ignore_index=True)
left_wow similarity right_wow
0 foo 1.0 foo
1 foo 1.0 foo
2 bar 1.0 bar
3 baz 1.0 baz
4 foo 1.0 foo
5 foo 1.0 foo
test_series_1_nameless_index = pd.Series(['foo', 'bar', 'baz', 'foo'], name='wow', index=list('ABCD'))
test_series_1_nameless_index
A    foo
B    bar
C    baz
D    foo
Name: wow, dtype: object
match_strings(test_series_1_nameless_index)
left_index left_wow similarity right_wow right_index
0 A foo 1.0 foo A
1 A foo 1.0 foo D
2 B bar 1.0 bar B
3 C baz 1.0 baz C
4 D foo 1.0 foo A
5 D foo 1.0 foo D
match_strings(test_series_1_nameless_index, ignore_index=True)
left_wow similarity right_wow
0 foo 1.0 foo
1 foo 1.0 foo
2 bar 1.0 bar
3 baz 1.0 baz
4 foo 1.0 foo
5 foo 1.0 foo
test_series_1_named_index = pd.Series(['foo', 'bar', 'baz', 'foo'], name='wow', index=list('ABCD')).rename_axis('id')
test_series_1_named_index
id
A    foo
B    bar
C    baz
D    foo
Name: wow, dtype: object
match_strings(test_series_1_named_index)
left_id left_wow similarity right_wow right_id
0 A foo 1.0 foo A
1 A foo 1.0 foo D
2 B bar 1.0 bar B
3 C baz 1.0 baz C
4 D foo 1.0 foo A
5 D foo 1.0 foo D
match_strings(test_series_1_named_index, ignore_index=True)
left_wow similarity right_wow
0 foo 1.0 foo
1 foo 1.0 foo
2 bar 1.0 bar
3 baz 1.0 baz
4 foo 1.0 foo
5 foo 1.0 foo
test_df = pd.DataFrame(
    {
        'label': ['foo', 'bar', 'baz', 'foo'],
         'LVL1': list('ABCD'),
         'LVL2': [0, 1, 2, 3]
    }
).set_index(['LVL1', 'LVL2']).squeeze()
test_df
LVL1  LVL2
A     0       foo
B     1       bar
C     2       baz
D     3       foo
Name: label, dtype: object
match_strings(test_df)
left_LVL1 left_LVL2 left_label similarity right_label right_LVL2 right_LVL1
0 A 0 foo 1.0 foo 0 A
1 A 0 foo 1.0 foo 3 D
2 B 1 bar 1.0 bar 1 B
3 C 2 baz 1.0 baz 2 C
4 D 3 foo 1.0 foo 0 A
5 D 3 foo 1.0 foo 3 D
match_strings(test_df, ignore_index=True)
left_label similarity right_label
0 foo 1.0 foo
1 foo 1.0 foo
2 bar 1.0 bar
3 baz 1.0 baz
4 foo 1.0 foo
5 foo 1.0 foo
test_df2 = pd.Series(
    ['foo', 'bar', 'baz', 'foo'], 
    index=pd.MultiIndex.from_tuples(list(zip(list('ABCD'), [0, 1, 2, 3])))
)
test_df2
A  0    foo
B  1    bar
C  2    baz
D  3    foo
dtype: object
match_strings(test_df2)
left_level_0 left_level_1 left_side similarity right_side right_level_1 right_level_0
0 A 0 foo 1.0 foo 0 A
1 A 0 foo 1.0 foo 3 D
2 B 1 bar 1.0 bar 1 B
3 C 2 baz 1.0 baz 2 C
4 D 3 foo 1.0 foo 0 A
5 D 3 foo 1.0 foo 3 D
match_strings(test_df2, ignore_index=True)
left_side similarity right_side
0 foo 1.0 foo
1 foo 1.0 foo
2 bar 1.0 bar
3 baz 1.0 baz
4 foo 1.0 foo
5 foo 1.0 foo
match_strings(test_df2, test_series_1_named)
left_level_0 left_level_1 left_side similarity right_wow right_index
0 A 0 foo 1.0 foo 3
1 A 0 foo 1.0 foo 0
2 B 1 bar 1.0 bar 1
3 C 2 baz 1.0 baz 2
4 D 3 foo 1.0 foo 3
5 D 3 foo 1.0 foo 0
match_strings(test_df2, test_series_1_named, ignore_index=True)
left_side similarity right_wow
0 foo 1.0 foo
1 foo 1.0 foo
2 bar 1.0 bar
3 baz 1.0 baz
4 foo 1.0 foo
5 foo 1.0 foo

2. group_similar_strings(..., ignore_index=[True | False])

Let's import some data:

companies_df = pd.read_csv('data/sec__edgar_company_info.csv')[0:50000]
companies_df.squeeze()
Line Number Company Name Company CIK Key
0 1 !J INC 1438823
1 2 #1 A LIFESAFER HOLDINGS, INC. 1509607
2 3 #1 ARIZONA DISCOUNT PROPERTIES LLC 1457512
3 4 #1 PAINTBALL CORP 1433777
4 5 $ LLC 1427189
... ... ... ...
49995 49996 BABB DOUGLAS J 1190359
49996 49997 BABB HENRY C 1193948
49997 49998 BABB INTERNATIONAL INC 1139504
49998 49999 BABB JACK J 1280368
49999 50000 BABB JAMES G. III 1575424

50000 rows × 3 columns

Let's give the data a different (unique-valued) index as is commonly done:

companies_df.set_index(['Line Number', 'Company CIK Key'], inplace=True, verify_integrity=True)
companies_df
Company Name
Line Number Company CIK Key
1 1438823 !J INC
2 1509607 #1 A LIFESAFER HOLDINGS, INC.
3 1457512 #1 ARIZONA DISCOUNT PROPERTIES LLC
4 1433777 #1 PAINTBALL CORP
5 1427189 $ LLC
... ... ...
49996 1190359 BABB DOUGLAS J
49997 1193948 BABB HENRY C
49998 1139504 BABB INTERNATIONAL INC
49999 1280368 BABB JACK J
50000 1575424 BABB JAMES G. III

50000 rows × 1 columns

Now let's do some grouping as usual:

companies = companies_df.copy()
group_similar_strings(companies['Company Name'])
group_rep_Line Number group_rep_Company CIK Key group_rep_Company Name
Line Number Company CIK Key
1 1438823 1 1438823 !J INC
2 1509607 2 1509607 #1 A LIFESAFER HOLDINGS, INC.
3 1457512 3 1457512 #1 ARIZONA DISCOUNT PROPERTIES LLC
4 1433777 4 1433777 #1 PAINTBALL CORP
5 1427189 5 1427189 $ LLC
... ... ... ... ...
49996 1190359 49996 1190359 BABB DOUGLAS J
49997 1193948 49997 1193948 BABB HENRY C
49998 1139504 49998 1139504 BABB INTERNATIONAL INC
49999 1280368 49999 1280368 BABB JACK J
50000 1575424 50000 1575424 BABB JAMES G. III

50000 rows × 3 columns

Notice that group_similar_strings preserves the index of the input Series while also showing the index(es) of the group-representatives in new columns with column-names prefixed by the string "group_rep_".

To ignore the indexes of the group-representatives, simply set the keyword argument ignore_index = True:

group_similar_strings(companies['Company Name'], ignore_index=True)
Line Number  Company CIK Key
1            1438823                                        !J INC
2            1509607                 #1 A LIFESAFER HOLDINGS, INC.
3            1457512            #1 ARIZONA DISCOUNT PROPERTIES LLC
4            1433777                             #1 PAINTBALL CORP
5            1427189                                         $ LLC
                                               ...                
49996        1190359                                BABB DOUGLAS J
49997        1193948                                  BABB HENRY C
49998        1139504                        BABB INTERNATIONAL INC
49999        1280368                                   BABB JACK J
50000        1575424                             BABB JAMES G. III
Name: group_rep_Company Name, Length: 50000, dtype: object

Because the output always inherits the index of the input Series, it is possible to directly assign it to new columns of the companies DataFrame (which has the exact same index) while also giving them new column names, as in the following:

companies[['Group Line Number', 'Group CIK Key', 'Group']] = \
group_similar_strings(companies['Company Name'], min_similarity=0.70)
companies
Company Name Group Line Number Group CIK Key Group
Line Number Company CIK Key
1 1438823 !J INC 1 1438823 !J INC
2 1509607 #1 A LIFESAFER HOLDINGS, INC. 2 1509607 #1 A LIFESAFER HOLDINGS, INC.
3 1457512 #1 ARIZONA DISCOUNT PROPERTIES LLC 3 1457512 #1 ARIZONA DISCOUNT PROPERTIES LLC
4 1433777 #1 PAINTBALL CORP 4 1433777 #1 PAINTBALL CORP
5 1427189 $ LLC 5 1427189 $ LLC
... ... ... ... ... ...
49996 1190359 BABB DOUGLAS J 49996 1190359 BABB DOUGLAS J
49997 1193948 BABB HENRY C 49997 1193948 BABB HENRY C
49998 1139504 BABB INTERNATIONAL INC 49998 1139504 BABB INTERNATIONAL INC
49999 1280368 BABB JACK J 49999 1280368 BABB JACK J
50000 1575424 BABB JAMES G. III 50000 1575424 BABB JAMES G. III

50000 rows × 4 columns

The grouping is not readily seen in the above display. So let us determine the number of members of each group (the group size) and then sort by group size:

companies['Group Size'] = \
companies\
.groupby(['Group Line Number', 'Group CIK Key'], as_index=False)['Company Name']\
.transform('count')
companies.sort_values('Group Size', ascending=False)
Company Name Group Line Number Group CIK Key Group Group Size
Line Number Company CIK Key
14060 1425318 ADVISORS DISCIPLINED TRUST 241 14940 1297377 ADVISORS DISCIPLINED TRUST 1866
13845 1662291 ADVISORS DISCIPLINED TRUST 1674 14940 1297377 ADVISORS DISCIPLINED TRUST 1866
13821 1662315 ADVISORS DISCIPLINED TRUST 1652 14940 1297377 ADVISORS DISCIPLINED TRUST 1866
13822 1662313 ADVISORS DISCIPLINED TRUST 1653 14940 1297377 ADVISORS DISCIPLINED TRUST 1866
13823 1662312 ADVISORS DISCIPLINED TRUST 1654 14940 1297377 ADVISORS DISCIPLINED TRUST 1866
... ... ... ... ... ... ...
20536 1529695 ALENTUS CORP 20536 1529695 ALENTUS CORP 1
20535 1585882 ALENT PLC/ADR 20535 1585882 ALENT PLC/ADR 1
20534 1688865 ALENSON CARMAN 20534 1688865 ALENSON CARMAN 1
20533 3433 ALENICK JEROME B 20533 3433 ALENICK JEROME B 1
50000 1575424 BABB JAMES G. III 50000 1575424 BABB JAMES G. III 1

50000 rows × 5 columns

Let's see where the largest group 'ADVISORS DISCIPLINED TRUST' ends:

companies.sort_values('Group Size', ascending=False).iloc[(start:=1861):(start + 10)]
Company Name Group Line Number Group CIK Key Group Group Size
Line Number Company CIK Key
13248 1578605 ADVISORS DISCIPLINED TRUST 1131 14940 1297377 ADVISORS DISCIPLINED TRUST 1866
13265 1582225 ADVISORS DISCIPLINED TRUST 1147 14940 1297377 ADVISORS DISCIPLINED TRUST 1866
13250 1578617 ADVISORS DISCIPLINED TRUST 1133 14940 1297377 ADVISORS DISCIPLINED TRUST 1866
13249 1578604 ADVISORS DISCIPLINED TRUST 1132 14940 1297377 ADVISORS DISCIPLINED TRUST 1866
13264 1582226 ADVISORS DISCIPLINED TRUST 1146 14940 1297377 ADVISORS DISCIPLINED TRUST 1866
32937 1643285 ANGELLIST-SSTY-FUND, A SERIES OF ANGELLIST-FG-... 32535 1600532 ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-F... 485
32552 1677061 ANGELLIST-BDID-PR-FUND, A SERIES OF ANGELLIST-... 32535 1600532 ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-F... 485
32943 1623545 ANGELLIST-STHE-FUND, A SERIES OF ANGELLIST-GP-... 32535 1600532 ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-F... 485
32944 1680681 ANGELLIST-STHE-PR-FUND, A SERIES OF ANGELLIST-... 32535 1600532 ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-F... 485
32947 1610185 ANGELLIST-SUIT-FUND, A SERIES OF ANGELLIST-FGR... 32535 1600532 ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-F... 485

3. match_most_similar(..., ignore_index=[True | False], replace_na=[True | False])

Now let's create a 'master' Series of group-representatives (we will use it later in the function match_most_similar):

master = companies.groupby(['Group Line Number', 'Group CIK Key'])['Group'].first().head(-5)

Notice that we have intentionally excluded the last five groups.

master.index.rename(['Line Number', 'Company CIK Key'], inplace=True)
master.rename('Company Name', inplace=True)
Line Number  Company CIK Key
1            1438823                                        !J INC
2            1509607                 #1 A LIFESAFER HOLDINGS, INC.
3            1457512            #1 ARIZONA DISCOUNT PROPERTIES LLC
4            1433777                             #1 PAINTBALL CORP
5            1427189                                         $ LLC
                                               ...                
49991        1615648                                BABACAN THOMAS
49992        1443093                            BABAD SHOLOM CHAIM
49993        1208255                                    BABALU LLC
49994        1270229                                  BABANI SUSIE
49995        1660243                                   BABAY KARIM
Name: Company Name, Length: 34249, dtype: object

Let's examine the neighbourhood of the largest group 'ADVISORS DISCIPLINED TRUST':

master.iloc[(start:=(master.index.get_loc((14940, 1297377)) - 5)):(start + 10)]
Line Number  Company CIK Key
13087        1029068                            ADVISORONE FUNDS
13089        1313535                           ADVISORPORT, INC.
13090        1559198                                ADVISORS 999
13094        789623             ADVISORS CAPITAL INVESTMENTS INC
13096        932536              ADVISORS CLEARING NETWORK, INC.
14940        1297377                  ADVISORS DISCIPLINED TRUST
14954        1313525               ADVISORS EDGE SECURITIES, LLC
14955        825201                            ADVISORS FUND L P
14957        1267654                         ADVISORS GENPAR INC
14958        1016084                      ADVISORS GROUP INC /DC
Name: Company Name, dtype: object

Now let's use master in function match_most_similar:

companies = companies_df.copy()
grouped_data = match_most_similar(
    master,
    companies['Company Name'],
    min_similarity=0.55,
    max_n_matches=2000
)

grouped_data
most_similar_Line Number most_similar_Company CIK Key most_similar_Company Name
Line Number Company CIK Key
1 1438823 1.0 1438823.0 !J INC
2 1509607 2.0 1509607.0 #1 A LIFESAFER HOLDINGS, INC.
3 1457512 3.0 1457512.0 #1 ARIZONA DISCOUNT PROPERTIES LLC
4 1433777 4.0 1433777.0 #1 PAINTBALL CORP
5 1427189 5.0 1427189.0 $ LLC
... ... ... ... ...
49996 1190359 31995.0 1336287.0 ANDREA DOUGLAS J
49997 1193948 NaN NaN BABB HENRY C
49998 1139504 19399.0 1569329.0 AL INTERNATIONAL, INC.
49999 1280368 NaN NaN BABB JACK J
50000 1575424 NaN NaN BABB JAMES G. III

50000 rows × 3 columns

Notice that match_most_similar also preserves the indexes of the input duplicates Series.

Also, notice that the indexes of the matched strings (including the matched strings themselves) in master appear in the output as columns whose names are prefixed with the string "most_similar_".

Finally, notice the 'NaN' values in the index columns corresponding to those strings in duplicates that have no match in master (above the similarity threshold specified by the keyword argument setting min_similarity=0.55). Recall that we earlier intentionally excluded certain groups in master. These 'NaN' values are a consequence of that exclusion. Apparently, it is also because of these 'NaN' values that these index columns have been converted into float data-types.

We can replace the 'NaN' values with their corresponding index values by setting the keyword argument replace_na=True. (The reason why the function match_most_similar does not do this by default is because in general duplicates may have a different index from master with possibly a different number of index levels.)

grouped_data = match_most_similar(
    master,
    companies['Company Name'],
    min_similarity=0.55,
    max_n_matches=2000,
    replace_na=True
)

grouped_data
most_similar_Line Number most_similar_Company CIK Key most_similar_Company Name
Line Number Company CIK Key
1 1438823 1 1438823 !J INC
2 1509607 2 1509607 #1 A LIFESAFER HOLDINGS, INC.
3 1457512 3 1457512 #1 ARIZONA DISCOUNT PROPERTIES LLC
4 1433777 4 1433777 #1 PAINTBALL CORP
5 1427189 5 1427189 $ LLC
... ... ... ... ...
49996 1190359 31995 1336287 ANDREA DOUGLAS J
49997 1193948 49997 1193948 BABB HENRY C
49998 1139504 19399 1569329 AL INTERNATIONAL, INC.
49999 1280368 49999 1280368 BABB JACK J
50000 1575424 50000 1575424 BABB JAMES G. III

50000 rows × 3 columns

Let's inspect the result by determining the group sizes:

grouped_data.groupby('most_similar_Company Name')['most_similar_Line Number'].count()\
.rename('Size')\
.sort_values(ascending=False)
most_similar_Company Name
ADVISORS DISCIPLINED TRUST                                 1866
ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-FUNDS, LLC     279
ALTERNATIVE LOAN TRUST 2005-4                               193
AGL LIFE ASSURANCE CO SEPARATE ACCOUNT                      188
AMERICREDIT AUTOMOBILE RECEIVABLES TRUST 2001-1              89
                                                           ... 
AIM SAFETY CO INC                                             1
AIM REAL ESTATE HEDGED EQUITY (U.S.) FUND, LP                 1
AIM QUANTITATIVE GLOBAL SF LP                                 1
AIM OXFORD HOLDINGS, LLC                                      1
TALISMAN ENERGY SWEDEN AB                                     1
Name: Size, Length: 34496, dtype: int64

Just like we did for function group_similar_strings, we can ignore the indexes of master if we choose, by setting ignore_index=True:

grouped_data_dropped = match_most_similar(
    master,
    companies['Company Name'],
    ignore_index=True,
    min_similarity=0.55,
    max_n_matches=2000
)
grouped_data_dropped
Line Number  Company CIK Key
1            1438823                                        !J INC
2            1509607                 #1 A LIFESAFER HOLDINGS, INC.
3            1457512            #1 ARIZONA DISCOUNT PROPERTIES LLC
4            1433777                             #1 PAINTBALL CORP
5            1427189                                         $ LLC
                                               ...                
49996        1190359                              ANDREA DOUGLAS J
49997        1193948                                  BABB HENRY C
49998        1139504                        AL INTERNATIONAL, INC.
49999        1280368                                   BABB JACK J
50000        1575424                             BABB JAMES G. III
Name: most_similar_Company Name, Length: 50000, dtype: object

As before, we can here also directly assign the output to new columns of the companies DataFrame (because it has the exact same index) while also giving them new column names, as in the following:

companies['Most Similar Name'] = grouped_data_dropped
companies
Company Name Most Similar Name
Line Number Company CIK Key
1 1438823 !J INC !J INC
2 1509607 #1 A LIFESAFER HOLDINGS, INC. #1 A LIFESAFER HOLDINGS, INC.
3 1457512 #1 ARIZONA DISCOUNT PROPERTIES LLC #1 ARIZONA DISCOUNT PROPERTIES LLC
4 1433777 #1 PAINTBALL CORP #1 PAINTBALL CORP
5 1427189 $ LLC $ LLC
... ... ... ...
49996 1190359 BABB DOUGLAS J ANDREA DOUGLAS J
49997 1193948 BABB HENRY C BABB HENRY C
49998 1139504 BABB INTERNATIONAL INC AL INTERNATIONAL, INC.
49999 1280368 BABB JACK J BABB JACK J
50000 1575424 BABB JAMES G. III BABB JAMES G. III

50000 rows × 2 columns

4. StringGrouper(..., ignore_index=[True | False], replace_na=[True | False])

The options ignore_index and replace_na can be passed directly to a StringGrouper object during instantiation. These will be used by its methods StringGrouper.get_groups and StringGrouper.get_matches.

The options ignore_index and replace_na can also, where applicable, be passed directly to StringGrouper.get_groups and StringGrouper.get_matches themselves to temporarily override the StringGrouper-instance's defaults.