% Conference paper "CTextEM" in the DASFAA 2016 proceedings (LNCS volume 9642).
% The citation key looks auto-generated; it is kept unchanged so that existing
% citations of this entry continue to resolve.
% NOTE(review): address holds the publisher location (Cham, assumed from the
% 978-3-319 ISBN prefix of Springer International Publishing) -- confirm against
% the volume's title page. The conference country now lives in the note field.
% The abstract is reproduced verbatim from the original entry.
@inproceedings{bd5f7f2b9d3444d08c091ca8cb7a7a3b,
  author    = {Yang, Qiang and Li, Zhixu and Gu, Binbin and Liu, An and Liu, Guanfeng and Zhao, Pengpeng and Zhao, Lei},
  title     = {{CTextEM}: Using Consolidated Textual Data for Entity Matching},
  editor    = {Navathe, Shamkant B. and Wu, Weili and Shekhar, Shashi and Du, Xiaoyong and Wang, X. Sean and Xiong, Hui},
  booktitle = {Database Systems for Advanced Applications},
  series    = {Lecture Notes in Computer Science},
  volume    = {9642},
  pages     = {117--132},
  publisher = {Springer},
  address   = {Cham},
  year      = {2016},
  isbn      = {9783319320243},
  doi       = {10.1007/978-3-319-32025-0_8},
  language  = {English},
  keywords  = {Consolidated textual data, CTextEM, Entity Matching, IDF score, Interaction, Sub-topic},
  note      = {21st International Conference on Database Systems for Advanced Applications, DASFAA 2016, United States; conference date: 2016-04-16 through 2016-04-19},
  abstract  = {Entity Matching (EM) identifies records referring to the same entity within or across databases. Existing methods using structured attribute values (such as digital, date or short string values) only may fail when the structured information is not enough to reflect the matching relationships between records. Nowadays more and more databases may have some unstructured textual attribute containing extra Consolidated Textual information (CText for short) of the record, but seldom work has been done on using the CText information for EM. Conventional string similarity metrics such as edit distance or bag-of-words are unsuitable for measuring the similarities between CTexts since there are hundreds or thousands of words with each CText, while existing topic models either can not work well since there is no obvious gaps between the various sub-topics in CText. In this paper, we work on employing CText in EM. A baseline algorithm identifying important phrases with high IDF scores from CTexts and then measuring the similarity between CTexts based on these phrases does not work well since it estimates the similarity in one dimension and neglects that these phrases belong to different topics. To this end, we propose a novel cooccurrence-based topic model to identify various sub-topics from each CText, and then measure the similarity between CTexts on the multiple sub-topic dimensions. Our empirical study on two real-world data set shows that our method outperforms the state-of-the-art EM methods and Text Understanding models by reaching a higher EM precision and recall.},
}