Demo paper on the TEXUS table-extraction system, in a Springer LNCS proceedings volume.
The citation key is kept verbatim so that existing citations keep resolving.
Acronyms in the title are braced so sentence-casing styles preserve them; month uses the
standard macro; address holds the publisher's city. No volume number is recorded here.
@inproceedings{79a16d0a1a88465cb06b412c3625facc,
  author    = {Rastan, Roya and Paik, Hye Young and Shepherd, John and Ryu, Seung Hwan and Beheshti, Amin},
  title     = {{TEXUS}: table extraction system for {PDF} documents},
  editor    = {Wang, Junhu and Cong, Gao and Chen, Jinjun and Qi, Jianzhong},
  booktitle = {Databases Theory and Applications},
  series    = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
  pages     = {345--349},
  publisher = {Springer, Springer Nature},
  address   = {Cham},
  year      = {2018},
  month     = jan,
  day       = {1},
  doi       = {10.1007/978-3-319-92013-9_30},
  isbn      = {9783319920122},
  language  = {English},
  keywords  = {Document processing, Information extraction, Table extraction, Table processing, TEXUS},
  abstract  = {Tables in documents are a rich and under-exploited source of structured data in otherwise unstructured documents. The extraction and understanding of tabular data is a challenging task which has attracted the attention of researchers from a range of disciplines such as information retrieval, machine learning and natural language processing. In this demonstration, we present an end-to-end table extraction and understanding system which takes a PDF file and automatically generates a set of XML and CSV files containing the extracted cells, rows and columns of tables, as well as a complete reading order analysis of the tables. Unlike many systems that work as a black-boxed, ad-hoc solution, our system design incorporates the open, reusable and extensible architecture to support research into, and development of, table-processing systems. During the demo, users will see how our system gradually transforms a PDF document into a set of structured files through a series of processing modules, namely: locating, segmenting and function/structure analysis.},
}