import pandas as pd
from string_grouper import match_strings, match_most_similar, group_similar_strings

1. match_strings(..., ignore_index=[True | False])

test_series_1_nameless = pd.Series(['foo', 'bar', 'baz', 'foo'])

test_series_1_nameless

0    foo
1    bar
2    baz
3    foo
dtype: object

match_strings(test_series_1_nameless)

	left_index	left_side	similarity	right_side	right_index
0	0	foo	1.0	foo	0
1	0	foo	1.0	foo	3
2	1	bar	1.0	bar	1
3	2	baz	1.0	baz	2
4	3	foo	1.0	foo	0
5	3	foo	1.0	foo	3

match_strings(test_series_1_nameless, ignore_index=True)

	left_side	similarity	right_side
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

test_series_1_named = pd.Series(['foo', 'bar', 'baz', 'foo'], name='wow')

test_series_1_named

0    foo
1    bar
2    baz
3    foo
Name: wow, dtype: object

match_strings(test_series_1_named)

	left_index	left_wow	similarity	right_wow	right_index
0	0	foo	1.0	foo	0
1	0	foo	1.0	foo	3
2	1	bar	1.0	bar	1
3	2	baz	1.0	baz	2
4	3	foo	1.0	foo	0
5	3	foo	1.0	foo	3

match_strings(test_series_1_named, ignore_index=True)

	left_wow	similarity	right_wow
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

test_series_1_nameless_index = pd.Series(['foo', 'bar', 'baz', 'foo'], name='wow', index=list('ABCD'))

test_series_1_nameless_index

A    foo
B    bar
C    baz
D    foo
Name: wow, dtype: object

match_strings(test_series_1_nameless_index)

	left_index	left_wow	similarity	right_wow	right_index
0	A	foo	1.0	foo	A
1	A	foo	1.0	foo	D
2	B	bar	1.0	bar	B
3	C	baz	1.0	baz	C
4	D	foo	1.0	foo	A
5	D	foo	1.0	foo	D

match_strings(test_series_1_nameless_index, ignore_index=True)

	left_wow	similarity	right_wow
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

test_series_1_named_index = pd.Series(['foo', 'bar', 'baz', 'foo'], name='wow', index=list('ABCD')).rename_axis('id')

test_series_1_named_index

id
A    foo
B    bar
C    baz
D    foo
Name: wow, dtype: object

match_strings(test_series_1_named_index)

	left_id	left_wow	similarity	right_wow	right_id
0	A	foo	1.0	foo	A
1	A	foo	1.0	foo	D
2	B	bar	1.0	bar	B
3	C	baz	1.0	baz	C
4	D	foo	1.0	foo	A
5	D	foo	1.0	foo	D

match_strings(test_series_1_named_index, ignore_index=True)

	left_wow	similarity	right_wow
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

test_df = pd.DataFrame(
    {
        'label': ['foo', 'bar', 'baz', 'foo'],
         'LVL1': list('ABCD'),
         'LVL2': [0, 1, 2, 3]
    }
).set_index(['LVL1', 'LVL2']).squeeze()

test_df

LVL1  LVL2
A     0       foo
B     1       bar
C     2       baz
D     3       foo
Name: label, dtype: object

match_strings(test_df)

	left_LVL1	left_LVL2	left_label	similarity	right_label	right_LVL2	right_LVL1
0	A	0	foo	1.0	foo	0	A
1	A	0	foo	1.0	foo	3	D
2	B	1	bar	1.0	bar	1	B
3	C	2	baz	1.0	baz	2	C
4	D	3	foo	1.0	foo	0	A
5	D	3	foo	1.0	foo	3	D

match_strings(test_df, ignore_index=True)

	left_label	similarity	right_label
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

test_df2 = pd.Series(
    ['foo', 'bar', 'baz', 'foo'], 
    index=pd.MultiIndex.from_tuples(list(zip(list('ABCD'), [0, 1, 2, 3])))
)

test_df2

A  0    foo
B  1    bar
C  2    baz
D  3    foo
dtype: object

match_strings(test_df2)

	left_level_0	left_level_1	left_side	similarity	right_side	right_level_1	right_level_0
0	A	0	foo	1.0	foo	0	A
1	A	0	foo	1.0	foo	3	D
2	B	1	bar	1.0	bar	1	B
3	C	2	baz	1.0	baz	2	C
4	D	3	foo	1.0	foo	0	A
5	D	3	foo	1.0	foo	3	D

match_strings(test_df2, ignore_index=True)

	left_side	similarity	right_side
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

match_strings(test_df2, test_series_1_named)

	left_level_0	left_level_1	left_side	similarity	right_wow	right_index
0	A	0	foo	1.0	foo	3
1	A	0	foo	1.0	foo	0
2	B	1	bar	1.0	bar	1
3	C	2	baz	1.0	baz	2
4	D	3	foo	1.0	foo	3
5	D	3	foo	1.0	foo	0

match_strings(test_df2, test_series_1_named, ignore_index=True)

	left_side	similarity	right_wow
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

2. group_similar_strings(..., ignore_index=[True | False])

Let's import some data:

companies_df = pd.read_csv('data/sec__edgar_company_info.csv')[0:50000]

companies_df.squeeze()

	Line Number	Company Name	Company CIK Key
0	1	!J INC	1438823
1	2	#1 A LIFESAFER HOLDINGS, INC.	1509607
2	3	#1 ARIZONA DISCOUNT PROPERTIES LLC	1457512
3	4	#1 PAINTBALL CORP	1433777
4	5	$ LLC	1427189
...	...	...	...
49995	49996	BABB DOUGLAS J	1190359
49996	49997	BABB HENRY C	1193948
49997	49998	BABB INTERNATIONAL INC	1139504
49998	49999	BABB JACK J	1280368
49999	50000	BABB JAMES G. III	1575424

50000 rows × 3 columns

Let's give the data a different (unique-valued) index as is commonly done:

companies_df.set_index(['Line Number', 'Company CIK Key'], inplace=True, verify_integrity=True)

companies_df

		Company Name
Line Number	Company CIK Key
1	1438823	!J INC
2	1509607	#1 A LIFESAFER HOLDINGS, INC.
3	1457512	#1 ARIZONA DISCOUNT PROPERTIES LLC
4	1433777	#1 PAINTBALL CORP
5	1427189	$ LLC
...	...	...
49996	1190359	BABB DOUGLAS J
49997	1193948	BABB HENRY C
49998	1139504	BABB INTERNATIONAL INC
49999	1280368	BABB JACK J
50000	1575424	BABB JAMES G. III

50000 rows × 1 columns

Now let's do some grouping as usual:

companies = companies_df.copy()

group_similar_strings(companies['Company Name'])

		group_rep_Line Number	group_rep_Company CIK Key	group_rep_Company Name
Line Number	Company CIK Key
1	1438823	1	1438823	!J INC
2	1509607	2	1509607	#1 A LIFESAFER HOLDINGS, INC.
3	1457512	3	1457512	#1 ARIZONA DISCOUNT PROPERTIES LLC
4	1433777	4	1433777	#1 PAINTBALL CORP
5	1427189	5	1427189	$ LLC
...	...	...	...	...
49996	1190359	49996	1190359	BABB DOUGLAS J
49997	1193948	49997	1193948	BABB HENRY C
49998	1139504	49998	1139504	BABB INTERNATIONAL INC
49999	1280368	49999	1280368	BABB JACK J
50000	1575424	50000	1575424	BABB JAMES G. III

50000 rows × 3 columns

Notice that group_similar_strings preserves the index of the input Series while also showing the index(es) of the group-representatives in new columns with column-names prefixed by the string "group_rep_".

To ignore the indexes of the group-representatives, simply set the keyword argument ignore_index = True:

group_similar_strings(companies['Company Name'], ignore_index=True)

Line Number  Company CIK Key
1            1438823                                        !J INC
2            1509607                 #1 A LIFESAFER HOLDINGS, INC.
3            1457512            #1 ARIZONA DISCOUNT PROPERTIES LLC
4            1433777                             #1 PAINTBALL CORP
5            1427189                                         $ LLC
                                               ...                
49996        1190359                                BABB DOUGLAS J
49997        1193948                                  BABB HENRY C
49998        1139504                        BABB INTERNATIONAL INC
49999        1280368                                   BABB JACK J
50000        1575424                             BABB JAMES G. III
Name: group_rep_Company Name, Length: 50000, dtype: object

Because the output always inherits the index of the input Series, it is possible to directly assign it to new columns of the companies DataFrame (which has the exact same index) while also giving them new column names, as in the following:

companies[['Group Line Number', 'Group CIK Key', 'Group']] = \
group_similar_strings(companies['Company Name'], min_similarity=0.70)

companies

		Company Name	Group Line Number	Group CIK Key	Group
Line Number	Company CIK Key
1	1438823	!J INC	1	1438823	!J INC
2	1509607	#1 A LIFESAFER HOLDINGS, INC.	2	1509607	#1 A LIFESAFER HOLDINGS, INC.
3	1457512	#1 ARIZONA DISCOUNT PROPERTIES LLC	3	1457512	#1 ARIZONA DISCOUNT PROPERTIES LLC
4	1433777	#1 PAINTBALL CORP	4	1433777	#1 PAINTBALL CORP
5	1427189	$ LLC	5	1427189	$ LLC
...	...	...	...	...	...
49996	1190359	BABB DOUGLAS J	49996	1190359	BABB DOUGLAS J
49997	1193948	BABB HENRY C	49997	1193948	BABB HENRY C
49998	1139504	BABB INTERNATIONAL INC	49998	1139504	BABB INTERNATIONAL INC
49999	1280368	BABB JACK J	49999	1280368	BABB JACK J
50000	1575424	BABB JAMES G. III	50000	1575424	BABB JAMES G. III

50000 rows × 4 columns

The grouping is not readily seen in the above display. So let us determine the number of members of each group (the group size) and then sort by group size:

companies['Group Size'] = \
companies\
.groupby(['Group Line Number', 'Group CIK Key'], as_index=False)['Company Name']\
.transform('count')
companies.sort_values('Group Size', ascending=False)

		Company Name	Group Line Number	Group CIK Key	Group	Group Size
Line Number	Company CIK Key
14060	1425318	ADVISORS DISCIPLINED TRUST 241	14940	1297377	ADVISORS DISCIPLINED TRUST	1866
13845	1662291	ADVISORS DISCIPLINED TRUST 1674	14940	1297377	ADVISORS DISCIPLINED TRUST	1866
13821	1662315	ADVISORS DISCIPLINED TRUST 1652	14940	1297377	ADVISORS DISCIPLINED TRUST	1866
13822	1662313	ADVISORS DISCIPLINED TRUST 1653	14940	1297377	ADVISORS DISCIPLINED TRUST	1866
13823	1662312	ADVISORS DISCIPLINED TRUST 1654	14940	1297377	ADVISORS DISCIPLINED TRUST	1866
...	...	...	...	...	...	...
20536	1529695	ALENTUS CORP	20536	1529695	ALENTUS CORP	1
20535	1585882	ALENT PLC/ADR	20535	1585882	ALENT PLC/ADR	1
20534	1688865	ALENSON CARMAN	20534	1688865	ALENSON CARMAN	1
20533	3433	ALENICK JEROME B	20533	3433	ALENICK JEROME B	1
50000	1575424	BABB JAMES G. III	50000	1575424	BABB JAMES G. III	1

50000 rows × 5 columns

Let's see where the largest group 'ADVISORS DISCIPLINED TRUST' ends:

companies.sort_values('Group Size', ascending=False).iloc[(start:=1861):(start + 10)]

		Company Name	Group Line Number	Group CIK Key	Group	Group Size
Line Number	Company CIK Key
13248	1578605	ADVISORS DISCIPLINED TRUST 1131	14940	1297377	ADVISORS DISCIPLINED TRUST	1866
13265	1582225	ADVISORS DISCIPLINED TRUST 1147	14940	1297377	ADVISORS DISCIPLINED TRUST	1866
13250	1578617	ADVISORS DISCIPLINED TRUST 1133	14940	1297377	ADVISORS DISCIPLINED TRUST	1866
13249	1578604	ADVISORS DISCIPLINED TRUST 1132	14940	1297377	ADVISORS DISCIPLINED TRUST	1866
13264	1582226	ADVISORS DISCIPLINED TRUST 1146	14940	1297377	ADVISORS DISCIPLINED TRUST	1866
32937	1643285	ANGELLIST-SSTY-FUND, A SERIES OF ANGELLIST-FG-...	32535	1600532	ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-F...	485
32552	1677061	ANGELLIST-BDID-PR-FUND, A SERIES OF ANGELLIST-...	32535	1600532	ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-F...	485
32943	1623545	ANGELLIST-STHE-FUND, A SERIES OF ANGELLIST-GP-...	32535	1600532	ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-F...	485
32944	1680681	ANGELLIST-STHE-PR-FUND, A SERIES OF ANGELLIST-...	32535	1600532	ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-F...	485
32947	1610185	ANGELLIST-SUIT-FUND, A SERIES OF ANGELLIST-FGR...	32535	1600532	ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-F...	485

3. match_most_similar(..., ignore_index=[True | False], replace_na=[True | False])

Now let's create a 'master' Series of group-representatives (we will use it later in the function match_most_similar):

master = companies.groupby(['Group Line Number', 'Group CIK Key'])['Group'].first().head(-5)

Notice that we have intentionally excluded the last five groups.

master.index.rename(['Line Number', 'Company CIK Key'], inplace=True)
master.rename('Company Name', inplace=True)

Line Number  Company CIK Key
1            1438823                                        !J INC
2            1509607                 #1 A LIFESAFER HOLDINGS, INC.
3            1457512            #1 ARIZONA DISCOUNT PROPERTIES LLC
4            1433777                             #1 PAINTBALL CORP
5            1427189                                         $ LLC
                                               ...                
49991        1615648                                BABACAN THOMAS
49992        1443093                            BABAD SHOLOM CHAIM
49993        1208255                                    BABALU LLC
49994        1270229                                  BABANI SUSIE
49995        1660243                                   BABAY KARIM
Name: Company Name, Length: 34249, dtype: object

Let's examine the neighbourhood of the largest group 'ADVISORS DISCIPLINED TRUST':

master.iloc[(start:=(master.index.get_loc((14940, 1297377)) - 5)):(start + 10)]

Line Number  Company CIK Key
13087        1029068                            ADVISORONE FUNDS
13089        1313535                           ADVISORPORT, INC.
13090        1559198                                ADVISORS 999
13094        789623             ADVISORS CAPITAL INVESTMENTS INC
13096        932536              ADVISORS CLEARING NETWORK, INC.
14940        1297377                  ADVISORS DISCIPLINED TRUST
14954        1313525               ADVISORS EDGE SECURITIES, LLC
14955        825201                            ADVISORS FUND L P
14957        1267654                         ADVISORS GENPAR INC
14958        1016084                      ADVISORS GROUP INC /DC
Name: Company Name, dtype: object

Now let's use master in function match_most_similar:

companies = companies_df.copy()

grouped_data = match_most_similar(
    master,
    companies['Company Name'],
    min_similarity=0.55,
    max_n_matches=2000
)

grouped_data

		most_similar_Line Number	most_similar_Company CIK Key	most_similar_Company Name
Line Number	Company CIK Key
1	1438823	1.0	1438823.0	!J INC
2	1509607	2.0	1509607.0	#1 A LIFESAFER HOLDINGS, INC.
3	1457512	3.0	1457512.0	#1 ARIZONA DISCOUNT PROPERTIES LLC
4	1433777	4.0	1433777.0	#1 PAINTBALL CORP
5	1427189	5.0	1427189.0	$ LLC
...	...	...	...	...
49996	1190359	31995.0	1336287.0	ANDREA DOUGLAS J
49997	1193948	NaN	NaN	BABB HENRY C
49998	1139504	19399.0	1569329.0	AL INTERNATIONAL, INC.
49999	1280368	NaN	NaN	BABB JACK J
50000	1575424	NaN	NaN	BABB JAMES G. III

50000 rows × 3 columns

Notice that match_most_similar also preserves the indexes of the input duplicates Series.

Also, notice that the indexes of the matched strings (including the matched strings themselves) in master appear in the output as columns whose names are prefixed with the string "most_similar_".

Finally, notice the 'NaN' values in the index columns corresponding to those strings in duplicates that have no match in master (above the similarity threshold specified by the keyword argument setting min_similarity=0.55). Recall that we earlier intentionally excluded certain groups in master. These 'NaN' values are a consequence of that exclusion. Apparently, it is also because of these 'NaN' values that these index columns have been converted into float data-types.

We can replace the 'NaN' values with their corresponding index values by setting the keyword argument replace_na=True. (The reason why the function match_most_similar does not do this by default is because in general duplicates may have a different index from master with possibly a different number of index levels.)

grouped_data = match_most_similar(
    master,
    companies['Company Name'],
    min_similarity=0.55,
    max_n_matches=2000,
    replace_na=True
)

grouped_data

		most_similar_Line Number	most_similar_Company CIK Key	most_similar_Company Name
Line Number	Company CIK Key
1	1438823	1	1438823	!J INC
2	1509607	2	1509607	#1 A LIFESAFER HOLDINGS, INC.
3	1457512	3	1457512	#1 ARIZONA DISCOUNT PROPERTIES LLC
4	1433777	4	1433777	#1 PAINTBALL CORP
5	1427189	5	1427189	$ LLC
...	...	...	...	...
49996	1190359	31995	1336287	ANDREA DOUGLAS J
49997	1193948	49997	1193948	BABB HENRY C
49998	1139504	19399	1569329	AL INTERNATIONAL, INC.
49999	1280368	49999	1280368	BABB JACK J
50000	1575424	50000	1575424	BABB JAMES G. III

50000 rows × 3 columns

Let's inspect the result by determining the group sizes:

grouped_data.groupby('most_similar_Company Name')['most_similar_Line Number'].count()\
.rename('Size')\
.sort_values(ascending=False)

most_similar_Company Name
ADVISORS DISCIPLINED TRUST                                 1866
ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-FUNDS, LLC     279
ALTERNATIVE LOAN TRUST 2005-4                               193
AGL LIFE ASSURANCE CO SEPARATE ACCOUNT                      188
AMERICREDIT AUTOMOBILE RECEIVABLES TRUST 2001-1              89
                                                           ... 
AIM SAFETY CO INC                                             1
AIM REAL ESTATE HEDGED EQUITY (U.S.) FUND, LP                 1
AIM QUANTITATIVE GLOBAL SF LP                                 1
AIM OXFORD HOLDINGS, LLC                                      1
TALISMAN ENERGY SWEDEN AB                                     1
Name: Size, Length: 34496, dtype: int64

Just like we did for function group_similar_strings, we can ignore the indexes of master if we choose, by setting ignore_index=True:

grouped_data_dropped = match_most_similar(
    master,
    companies['Company Name'],
    ignore_index=True,
    min_similarity=0.55,
    max_n_matches=2000
)

grouped_data_dropped

Line Number  Company CIK Key
1            1438823                                        !J INC
2            1509607                 #1 A LIFESAFER HOLDINGS, INC.
3            1457512            #1 ARIZONA DISCOUNT PROPERTIES LLC
4            1433777                             #1 PAINTBALL CORP
5            1427189                                         $ LLC
                                               ...                
49996        1190359                              ANDREA DOUGLAS J
49997        1193948                                  BABB HENRY C
49998        1139504                        AL INTERNATIONAL, INC.
49999        1280368                                   BABB JACK J
50000        1575424                             BABB JAMES G. III
Name: most_similar_Company Name, Length: 50000, dtype: object

As before, we can here also directly assign the output to new columns of the companies DataFrame (because it has the exact same index) while also giving them new column names, as in the following:

companies['Most Similar Name'] = grouped_data_dropped

companies

		Company Name	Most Similar Name
Line Number	Company CIK Key
1	1438823	!J INC	!J INC
2	1509607	#1 A LIFESAFER HOLDINGS, INC.	#1 A LIFESAFER HOLDINGS, INC.
3	1457512	#1 ARIZONA DISCOUNT PROPERTIES LLC	#1 ARIZONA DISCOUNT PROPERTIES LLC
4	1433777	#1 PAINTBALL CORP	#1 PAINTBALL CORP
5	1427189	$ LLC	$ LLC
...	...	...	...
49996	1190359	BABB DOUGLAS J	ANDREA DOUGLAS J
49997	1193948	BABB HENRY C	BABB HENRY C
49998	1139504	BABB INTERNATIONAL INC	AL INTERNATIONAL, INC.
49999	1280368	BABB JACK J	BABB JACK J
50000	1575424	BABB JAMES G. III	BABB JAMES G. III

50000 rows × 2 columns

4. StringGrouper(..., ignore_index=[True | False], replace_na=[True | False])

The options ignore_index and replace_na can be passed directly to a StringGrouper object during instantiation. These will be used by its methods StringGrouper.get_groups and StringGrouper.get_matches.

The options ignore_index and replace_na can also, where applicable, be passed directly to StringGrouper.get_groups and StringGrouper.get_matches themselves to temporarily override the StringGrouper-instance's defaults.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

ignore_index_and_replace_na.md

ignore_index_and_replace_na.md

1. match_strings(..., ignore_index=[True | False])

2. group_similar_strings(..., ignore_index=[True | False])

3. match_most_similar(..., ignore_index=[True | False], replace_na=[True | False])

4. StringGrouper(..., ignore_index=[True | False], replace_na=[True | False])

	left_index	left_side	similarity	right_side	right_index
0	0	foo	1.0	foo	0
1	0	foo	1.0	foo	3
2	1	bar	1.0	bar	1
3	2	baz	1.0	baz	2
4	3	foo	1.0	foo	0
5	3	foo	1.0	foo	3

	left_side	similarity	right_side
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

	left_index	left_wow	similarity	right_wow	right_index
0	0	foo	1.0	foo	0
1	0	foo	1.0	foo	3
2	1	bar	1.0	bar	1
3	2	baz	1.0	baz	2
4	3	foo	1.0	foo	0
5	3	foo	1.0	foo	3

	left_wow	similarity	right_wow
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

	left_index	left_wow	similarity	right_wow	right_index
0	A	foo	1.0	foo	A
1	A	foo	1.0	foo	D
2	B	bar	1.0	bar	B
3	C	baz	1.0	baz	C
4	D	foo	1.0	foo	A
5	D	foo	1.0	foo	D

	left_wow	similarity	right_wow
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

	left_id	left_wow	similarity	right_wow	right_id
0	A	foo	1.0	foo	A
1	A	foo	1.0	foo	D
2	B	bar	1.0	bar	B
3	C	baz	1.0	baz	C
4	D	foo	1.0	foo	A
5	D	foo	1.0	foo	D

	left_wow	similarity	right_wow
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

	left_LVL1	left_LVL2	left_label	similarity	right_label	right_LVL2	right_LVL1
0	A	0	foo	1.0	foo	0	A
1	A	0	foo	1.0	foo	3	D
2	B	1	bar	1.0	bar	1	B
3	C	2	baz	1.0	baz	2	C
4	D	3	foo	1.0	foo	0	A
5	D	3	foo	1.0	foo	3	D

	left_label	similarity	right_label
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

	left_index	left_side	similarity	right_side	right_index
0	0	foo	1.0	foo	0
1	0	foo	1.0	foo	3
2	1	bar	1.0	bar	1
3	2	baz	1.0	baz	2
4	3	foo	1.0	foo	0
5	3	foo	1.0	foo	3

	left_side	similarity	right_side
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

	left_index	left_wow	similarity	right_wow	right_index
0	0	foo	1.0	foo	0
1	0	foo	1.0	foo	3
2	1	bar	1.0	bar	1
3	2	baz	1.0	baz	2
4	3	foo	1.0	foo	0
5	3	foo	1.0	foo	3

	left_wow	similarity	right_wow
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

	left_index	left_wow	similarity	right_wow	right_index
0	A	foo	1.0	foo	A
1	A	foo	1.0	foo	D
2	B	bar	1.0	bar	B
3	C	baz	1.0	baz	C
4	D	foo	1.0	foo	A
5	D	foo	1.0	foo	D

	left_wow	similarity	right_wow
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

	left_id	left_wow	similarity	right_wow	right_id
0	A	foo	1.0	foo	A
1	A	foo	1.0	foo	D
2	B	bar	1.0	bar	B
3	C	baz	1.0	baz	C
4	D	foo	1.0	foo	A
5	D	foo	1.0	foo	D

	left_wow	similarity	right_wow
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

	left_LVL1	left_LVL2	left_label	similarity	right_label	right_LVL2	right_LVL1
0	A	0	foo	1.0	foo	0	A
1	A	0	foo	1.0	foo	3	D
2	B	1	bar	1.0	bar	1	B
3	C	2	baz	1.0	baz	2	C
4	D	3	foo	1.0	foo	0	A
5	D	3	foo	1.0	foo	3	D

	left_label	similarity	right_label
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

Files

ignore_index_and_replace_na.md

Latest commit

History

ignore_index_and_replace_na.md

File metadata and controls

1. match_strings(..., ignore_index=[True | False])

2. group_similar_strings(..., ignore_index=[True | False])

3. match_most_similar(..., ignore_index=[True | False], replace_na=[True | False])

4. StringGrouper(..., ignore_index=[True | False], replace_na=[True | False])

	left_index	left_side	similarity	right_side	right_index
0	0	foo	1.0	foo	0
1	0	foo	1.0	foo	3
2	1	bar	1.0	bar	1
3	2	baz	1.0	baz	2
4	3	foo	1.0	foo	0
5	3	foo	1.0	foo	3

	left_side	similarity	right_side
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

	left_index	left_wow	similarity	right_wow	right_index
0	0	foo	1.0	foo	0
1	0	foo	1.0	foo	3
2	1	bar	1.0	bar	1
3	2	baz	1.0	baz	2
4	3	foo	1.0	foo	0
5	3	foo	1.0	foo	3

	left_wow	similarity	right_wow
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

	left_index	left_wow	similarity	right_wow	right_index
0	A	foo	1.0	foo	A
1	A	foo	1.0	foo	D
2	B	bar	1.0	bar	B
3	C	baz	1.0	baz	C
4	D	foo	1.0	foo	A
5	D	foo	1.0	foo	D

	left_wow	similarity	right_wow
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

	left_id	left_wow	similarity	right_wow	right_id
0	A	foo	1.0	foo	A
1	A	foo	1.0	foo	D
2	B	bar	1.0	bar	B
3	C	baz	1.0	baz	C
4	D	foo	1.0	foo	A
5	D	foo	1.0	foo	D

	left_wow	similarity	right_wow
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo

	left_LVL1	left_LVL2	left_label	similarity	right_label	right_LVL2	right_LVL1
0	A	0	foo	1.0	foo	0	A
1	A	0	foo	1.0	foo	3	D
2	B	1	bar	1.0	bar	1	B
3	C	2	baz	1.0	baz	2	C
4	D	3	foo	1.0	foo	0	A
5	D	3	foo	1.0	foo	3	D

	left_label	similarity	right_label
0	foo	1.0	foo
1	foo	1.0	foo
2	bar	1.0	bar
3	baz	1.0	baz
4	foo	1.0	foo
5	foo	1.0	foo