@inproceedings{maxwell-bills-2017-endangered,
  title     = {Endangered Data for Endangered Languages: Digitizing Print dictionaries},
  author    = {Maxwell, Michael and Bills, Aric},
  booktitle = {Proceedings of the 2nd Workshop on the Use of Computational Methods in the Study of Endangered Languages},
  month     = mar,
  year      = {2017},
  address   = {Honolulu},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/W17-0112},
  doi       = {10.18653/v1/W17-0112},
  pages     = {85--91},
}

% Working notes. Text outside of entries is ignored by BibTeX; the leading
% percent sign only marks these lines as notes for human readers.
%
% Goodman, Michael Wayne, Ryan Georgi & Fei Xia. 2018. PDF-to-Text Reanalysis
% for Linguistic Data Mining. The Eleventh International Conference on Language
% Resources and Evaluation. 723-727. Available online at:
% http://www.lrec-conf.org/proceedings/lrec2018/index.html

@inproceedings{goodman-etal-2018-pdf,
  title     = {{PDF}-to-Text Reanalysis for Linguistic Data Mining},
  author    = {Goodman, Michael Wayne and Georgi, Ryan and Xia, Fei},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  month     = may,
  year      = {2018},
  address   = {Miyazaki, Japan},
  publisher = {European Language Resources Association (ELRA)},
  url       = {https://aclanthology.org/L18-1116},
  pages     = {723--727},
}

% Tries to find IGT with image analysis.
%
% A Supervised Machine Learning Approach for Post-OCR Error Detection for
% Historical Text. Dana Dannélls (Språkbanken Text, University of Gothenburg),
% Shafqat Mumtaz Virk (Språkbanken Text, University of Gothenburg)
%
% An Unsupervised method for OCR Post-Correction and Spelling Normalisation for
% Finnish. Quan Duong, Mika Hämäläinen, Simon Hengchen,
%
% TaxoFinder: A Graph-based Approach for Taxonomy Learning. Yong-Bin Kang,
% Pari Delir Haghigh, and Frada Burstein
%
% Hi all, The open version of the DReaM English corpus is now available as a
% downloadable resource on the SBX webpage. The link is:
% https://spraakbanken.gu.se/en/resources/dream
% We can use this link in our future publications if we want a reference to a
% downloadable version.
% Working notes (continued). Text outside of entries is ignored by BibTeX; the
% leading percent sign only marks these lines as notes for human readers.
%
% Data Readiness for Natural Language Processing. Fredrik Olsson (RISE Sweden),
% Magnus Sahlgren (RISE Sweden)
%
% https://jep-taln2020.loria.fr/
% https://jep-taln2020.loria.fr/article-203/
%
% TODO
%
% Segerer, Guillaume and Flavier, Sébastien. 2011-2016. RefLex: Reference
% Lexicon of Africa, Version 1.1. Paris, Lyon. http://reflex.cnrs.fr/
%
% Borin, Lars, Shafqat Mumtaz Virk & Anju Saxena. 2017. Language technology for
% digital linguistics: turning the Linguistic Survey of India into a rich
% source of linguistic information. Paper presented at the 18th International
% Conference on Computational Linguistics and Intelligent Text Processing
% (CICLing), April 17 to 23, 2017, Budapest, Hungary.
%
% World Language Description Heritage: Worldcat, Google Books, Crowd-sourcing,
% Crowd-enumerating and Bibliography Parsing. Anonymous Submission.
% Category: Long

@inproceedings{bonnet-segerer-2020-generateur,
  title     = {G{\'e}n{\'e}rateur de dictionnaires au format Android pour les langues peu dot{\'e}es (Dictionary App Generator for Less Resourced Languages)},
  author    = {Bonnet, R{\'e}my and Segerer, Guillaume},
  booktitle = {Actes de la 6e conf{\'e}rence conjointe Journ{\'e}es d'{\'E}tudes sur la Parole (JEP, 33e {\'e}dition), Traitement Automatique des Langues Naturelles (TALN, 27e {\'e}dition), Rencontre des {\'E}tudiants Chercheurs en Informatique pour le Traitement Automatique des Langues (R{\'E}CITAL, 22e {\'e}dition). Volume 4 : D{\'e}monstrations et r{\'e}sum{\'e}s d'articles internationaux},
  month     = jun,
  year      = {2020},
  address   = {Nancy, France},
  publisher = {ATALA et AFCP},
  url       = {https://aclanthology.org/2020.jeptalnrecital-demos.2},
  pages     = {6--9},
  abstract  = {Nous pr{\'e}sentons un outil informatique en source libre permettant, {\`a} partir d{'}un dictionnaire sous format {\'e}lectronique de g{\'e}n{\'e}rer une application Android. L{'}objectif est de tirer profit de l{'}augmentation spectaculaire des utilisateurs de mobiles multifonctions dans de nombreuses r{\'e}gions du monde, notamment en Afrique, pour permettre aux locuteurs de langues moins dot{\'e}es de b{\'e}n{\'e}ficier de dictionnaires de bonne qualit{\'e}, faciles {\`a} utiliser et de s{'}impliquer dans la collecte de donn{\'e}es. Cet outil est con{\c{c}}u pour {\^e}tre compatible avec les principaux formats lexicographiques.},
  language  = {French},
}

% --- Selected contributions from SLTC-2020 ---

@incollection{ocr:DannellsVirk:Post-OCR,
  author    = {Dana Dannélls and Shafqat Virk},
  editor    = {Simon Dobnik and Richard Johansson and Peter Ljunglöf},
  title     = {A Supervised Machine Learning Approach for Post-{OCR} Error Detection for Historical Text},
  booktitle = {Selected contributions from the Eighth Swedish Language Technology Conference (SLTC-2020), 25-27 November 2020},
  publisher = {Linköping: Linköping Electronic Press},
  year      = 2021,
  pages     = {13--20},
}

@incollection{cl:Wichmann:Linguistic-Terms,
  author    = {Søren Wichmann},
  editor    = {Simon Dobnik and Richard Johansson and Peter Ljunglöf},
  title     = {Pipeline for a Data-driven Network of Linguistic Terms},
  booktitle = {Selected contributions from the Eighth Swedish Language Technology Conference (SLTC-2020), 25-27 November 2020},
  publisher = {Linköping: Linköping Electronic Press},
  year      = 2021,
  pages     = {66--71},
}

@incollection{cl:Hammarstrom:Term-Spotting,
  author    = {Harald Hammarstr{\"o}m and One-Soon Her and Marc Tang},
  editor    = {Simon Dobnik and Richard Johansson and Peter Ljunglöf},
  title     = {Term-Spotting: A quick-and-dirty method for extracting typological features of language from grammatical descriptions},
  booktitle = {Selected contributions from the Eighth Swedish Language Technology Conference (SLTC-2020), 25-27 November 2020},
  publisher = {Linköping: Linköping Electronic Press},
  year      = 2021,
  pages     = {27--34},
}

% --- Theses ---

@mastersthesis{dream:Foster,
  author = {Daniel Foster},
  title  = {Automatic Frame-Semantic Parsing for Linguistic Descriptions: Extracting typological linguistic information from unstructured text},
  school = {University of Gothenburg},
  year   = 2019,
}

@mastersthesis{dream:Aslam,
  author = {Aslam, Muhammad Irfan},
  title  = {Semantic frame based automatic extraction of typological information from descriptive grammars},
  school = {University of Skövde},
  year   = 2019,
  url    = {http://his.diva-portal.org/smash/record.jsf?pid=diva2%3A1371627&dswid=-9783},
}

% --- Frame-semantic extraction and LingFN ---

@incollection{dream:Virk:Frame-Extraction,
  author    = {Shafqat Mumtaz Virk and Azam Sheikh Muhammad and Lars Borin and Muhammad Irfan Aslam and Saania Iqbal and Nazia Khurram},
  title     = {Exploiting Frame-Semantics and Frame-Semantic Parsing for Automatic Extraction of Typological Information from Descriptive Grammars of Natural Languages},
  publisher = {Varna, Bulgaria: INCOMA Ltd},
  pages     = {1247--1256},
  booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2019)},
  year      = 2019,
}

@incollection{dream:Virk:LingFN,
  author    = {Shafqat Virk and Per Malm and Lars Borin and Anju Saxena},
  title     = {{LingFN}: A {FrameNet} for the Linguistics Domain},
  booktitle = {Computational Linguistics and Intelligent Text Processing},
  editor    = {Gelbukh, Alexander},
  volume    = {13451},
  series    = {Lecture Notes in Computer Science},
  publisher = {Cham: Springer},
  pages     = {367--379},
  year      = 2023,
}

@incollection{MALM18.10,
  author    = {Per Malm and Shafqat Mumtaz Virk and Lars Borin and Anju Saxena},
  title     = {{LingFN}: Towards a Domain-specific Linguistic {FrameNet}},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018) Workshop 5 --- International FrameNet Workshop 2018: Multilingual FrameNets and Constructicons},
  year      = {2018},
  month     = may,
  date      = {7-12},
  pages     = {37--43},
  location  = {Miyazaki, Japan},
  editor    = {Tiago Timponi Torrent and Lars Borin and Collin F. Baker},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Paris, France},
  isbn      = {979-10-95546-04-7},
  language  = {english},
}

% --- Presentations and posters ---

@misc{dream:WichmannRama,
  author       = {S{\o}ren Wichmann and Taraka Rama},
  title        = {Towards unsupervised extraction of linguistic typological features from language descriptions},
  howpublished = {First Workshop on Typology for Polyglot NLP, Florence, Aug. 1, 2019 (Co-located with ACL, July 28-Aug. 2, 2019)},
  year         = 2019,
}

@misc{dream:Wichmann:UFMG,
  author       = {S{\o}ren Wichmann and Harald Hammarström and Shafqat Virk},
  title        = {Information extraction of linguistic typological information from grammatical descriptions},
  howpublished = {Presentation at the Universidade Federal de Minas Gerais, 4 Nov 2019},
  year         = 2019,
}

@misc{dream:Hammarstrom:State-Description,
  author       = {Harald Hammarström},
  title        = {The State of Description of the World's Languages},
  howpublished = {Presentation at the University of Helsinki, 15 Jan 2020},
  year         = 2020,
}

@misc{dream:Wichmann:Dream,
  author       = {S{\o}ren Wichmann},
  title        = {The {DReaM} Project: A dictionary/grammar reading machine},
  howpublished = {Presentation at the Kazan University philological faculty},
  year         = 2017,
}

@misc{dream:Hammarstrom:Extensa,
  author       = {Harald Hammarstr{\"o}m},
  title        = {¿Cuál es la gramática mas extensa? Ideas computacionales para medir la cantidad de una descripción gramatical de una lengua},
  howpublished = {Presentation at the Pontificia Universidad Cat{\'o}lica de Per{\'u}, 9 May 2019, Lima},
  year         = 2019,
}

@misc{dream:Virk:LingFN-CLT,
  author       = {Shafqat Virk and Lars Borin and Per Malm and Anju Saxena and Markus Forsberg and Harald Hammarstr{\"o}m and M. Azam and M. Irfan},
  title        = {{LingFN}: a {FrameNet} for the Linguistics Domain},
  howpublished = {Presentation at the CLT Retreat, 8 May 2019},
  year         = 2019,
}

@misc{dream:Hammarstrom:Text-Mining,
  author       = {Harald Hammarstr{\"o}m and Shafqat Virk and Markus Forsberg},
  title        = {Text Mining on Grammatical Descriptions of the Languages of the World},
  howpublished = {Presentation at the Infrastructural Tensions workshop, Uppsala, 29-30 Aug 2019},
  year         = 2019,
}

@misc{Hammarstrom:Grammar-Grammars,
  author       = {Harald Hammarstr{\"o}m and Shafqat Virk and Markus Forsberg},
  title        = {Extracting Grammar from Grammars: From Raw-Text Descriptions to Grammatical Characteristics of the Languages of the World},
  howpublished = {Presentation at the Computational Linguistics Seminar, Uppsala},
  year         = 2017,
}

@misc{Hammarstrom:Waiheke2017,
  author       = {Harald Hammarstr{\"o}m and Shafqat Virk and Markus Forsberg},
  title        = {Automatically Filling in {Grambank}},
  howpublished = {Presentation at the Glottobank meeting, Waiheke},
  year         = 2017,
}

@misc{Macklin-Cordes,
  author       = {Jayden L. Macklin-Cordes and Nathaniel L. Blackbourne and Thomas J. Bott and Jacqueline Cook and T. Mark Ellison and Jordan Hollis and Edith E. Kirlew and Genevieve C. Richards and Sanle Zhao and Erich R. Round},
  title        = {Robots who read grammars},
  howpublished = {Poster presented at CoEDL Fest 2017, Alexandra Park Conference Centre, Alexandra Headlands, QLD},
  year         = 2017,
}

% --- Background literature ---

@book{el:Tsunoda:LELR,
  author    = {Tasaku Tsunoda},
  title     = {Language Endangerment and Language Revitalization},
  publisher = {Berlin: Mouton de Gruyter},
  series    = {Trends in Linguistics: Studies and Monographs},
  volume    = {148},
  pages     = {307},
  year      = {2005},
  gbid      = {t3MphHy-2g0C},
  hhtype    = {socling},
  inlg      = {English [eng]},
  isbn      = {9783110184297},
  oclc      = {56614349},
}

@article{typ:Plank:WALS,
  author  = {Frank Plank},
  title   = {{WALS} values evaluated},
  journal = {Linguistic Typology},
  volume  = {13},
  number  = {1},
  pages   = {41--75},
  year    = {2009},
  hhtype  = {specific_feature},
  inlg    = {English [eng]},
}

@article{typ:Polyakovetal:WALS-JM,
  author           = {Vladimir N. Polyakov and Valery D. Solovyev and S{\o}ren Wichmann and Oleg Belyaev},
  title            = {Using {WALS} and {Jazyki Mira}},
  journal          = {Linguistic Typology},
  volume           = {13},
  pages            = {137--167},
  year             = {2009},
  glottolog_ref_id = {153004},
  hhtype           = {specific_feature},
  inlg             = {English [eng]},
}

@inproceedings{W17-0119,
  author    = {Littell, Patrick and Pine, Aidan and Davis, Henry},
  title     = {{Waldayu} and {Waldayu Mobile}: Modern digital dictionary interfaces for endangered languages},
  booktitle = {Proceedings of the 2nd Workshop on the Use of Computational Methods in the Study of Endangered Languages},
  year      = {2017},
  publisher = {Association for Computational Linguistics},
  pages     = {141--150},
  location  = {Honolulu},
  url       = {http://www.aclweb.org/anthology/W17-0119},
}

@inproceedings{cl:Kamholz:PanLex,
  author    = {David Kamholz and Jonathan Pool and Susan Colowick},
  title     = {{PanLex}: Building a Resource for Panlingual Lexical Translation},
  booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC'14)},
  year      = {2014},
  month     = may,
  date      = {26-31},
  address   = {Reykjavik, Iceland},
  editor    = {Nicoletta Calzolari and Khalid Choukri and Thierry Declerck and Hrafn Loftsson and Bente Maegaard and Joseph Mariani and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
  publisher = {European Language Resources Association (ELRA)},
  isbn      = {978-2-9517408-8-4},
  language  = {english},
}

@inproceedings{lkl:Bender:From-IGT,
  author    = {Bender, Emily M. and Crowgey, Joshua and Goodman, Michael Wayne and Xia, Fei},
  title     = {Learning Grammar Specifications from {IGT}: A Case Study of {Chintang}},
  booktitle = {Proceedings of the 2014 Workshop on the Use of Computational Methods in the Study of Endangered Languages},
  publisher = {Association for Computational Linguistics},
  address   = {Baltimore, Maryland, USA},
  pages     = {43--53},
  year      = {2014},
  url       = {http://www.aclweb.org/anthology/W/W14/W14-2206},
  month     = jun,
}

@incollection{ling:Bickel:Distributional-Typology,
  author    = {Bickel, Balthasar},
  editor    = {Bernd Heine and Heiko Narrog},
  title     = {Distributional typology: statistical inquiries into the dynamics of linguistic diversity},
  booktitle = {The Oxford Handbook of Linguistic Analysis},
  publisher = {Oxford: Oxford University Press},
  pages     = {901--923},
  year      = {2015},
  edition   = {2},
}

@book{ir:Manning:IR,
  author    = {Christopher D. Manning and Prabhakar Raghavan and Hinrich Schütze},
  title     = {Introduction to Information Retrieval},
  publisher = {Cambridge: Cambridge University Press},
  year      = {2008},
}

@misc{ling:Cysouw:Typology-Types,
  author       = {Cysouw, Michael},
  title        = {Typology without Types: Quantitatively inducing a Numeral Typology},
  year         = {2011},
  url          = {http://cysouw.de/home/presentations_files/cysouwALT9numerals.pdf},
  howpublished = {Poster presented at the 9th biannual meeting of the Association for Linguistic Typology, ALT9, Hong Kong, China},
}

@misc{ling:Cooper:Warehouse,
  author       = {Doug Cooper},
  title        = {Logistics of the Asia-Pacific Linguistic Data Warehouse},
  year         = {2014},
  howpublished = {Paper presented at the Language Comparison with Linguistic Databases: RefLex and Typological Databases, 7-8 Oct 2014},
}

@incollection{ling:Dryer:Descriptive,
  author    = {Dryer, Matthew S.},
  editor    = {Felix Ameka and Alan Dench and Nicholas Evans},
  title     = {Descriptive theories, explanatory theories, and basic linguistic theory},
  booktitle = {Catching Language: Issues in Grammar Writing},
  publisher = {Berlin: Mouton de Gruyter},
  pages     = {207--234},
  year      = {2006},
}

@incollection{hv:Guldemann:Africa:Macro-Areas,
  author           = {Güldemann, Tom},
  editor           = {Lameli, Alfred and Kehrein, Roland and Rabanus, Stefan},
  title            = {"Sprachraum" and geography: Linguistic macro-areas in {Africa}},
  booktitle        = {Language and Space: An International Handbook of Linguistic Variation Volume 2: Language Mapping},
  publisher        = {Berlin: Mouton de Gruyter},
  series           = {Handbooks of Linguistics and Communication Science},
  volume           = {30/2},
  pages            = {561--585},
  year             = {2010},
  fn               = {africa\guldemann_sprachraum2010.pdf, africa\guldemann_sprachraum-africa2010.zip},
  glottolog_ref_id = {22213},
  hhtype           = {overview;comparative},
  inlg             = {English [eng]},
  macro_area       = {Africa},
}

@incollection{ocr:Hammarstrom,
  author    = {Harald Hammarström and Shafqat Mumtaz Virk and Markus Forsberg},
  title     = {Poor Man's {OCR} Post-Correction: Unsupervised Recognition of Variant Spelling Applied to a Multilingual Document Collection},
  booktitle = {Proceedings of the Digital Access to Textual Cultural Heritage (DATeCH) conference},
  publisher = {Göttingen: ACM},
  pages     = {71--75},
  year      = {2017},
}

@misc{typ:Hammarstrom:Three-Approaches,
  author       = {Harald Hammarström},
  title        = {Three Approaches to Prefix and Suffix Statistics in the Languages of the World},
  year         = {2013},
  howpublished = {Paper presented at the Workshop on Corpus-based Quantitative Typology (CoQuaT 2013)},
}

@book{ling:Harris:Structural,
  author    = {Harris, Zellig S.},
  title     = {Methods in structural linguistics},
  publisher = {Chicago: University of Chicago Press},
  pages     = {xv+384},
  year      = {1951},
}

@book{ling:Dryer:Word-Order,
  author    = {Matthew Dryer},
  title     = {World Atlas of Word Order in Language},
  publisher = {Oxford: Oxford University Press},
  year      = {forthcoming},
}

@article{typ:EvansLevinson:Universals,
  author           = {Nicholas Evans and Stephen Levinson},
  title            = {The Myth of Language Universals: Language diversity and its importance for cognitive science},
  journal          = {Behavioral and Brain Sciences},
  volume           = {32},
  number           = {5},
  pages            = {429--492},
  year             = {2009},
  glottolog_ref_id = {40416},
  hhtype           = {specific_feature},
  inlg             = {English [eng]},
}

@article{ling:Himmelmann:Suffixing:2014,
  author  = {Nikolaus Himmelmann},
  title   = {Asymmetries in the prosodic phrasing of function words: Another look at the suffixing preference},
  journal = {Language},
  volume  = {90},
  number  = {4},
  pages   = {927--960},
  year    = {2014},
}

@incollection{cl:Virk:Automatic-Extraction,
  author    = {Virk, Shafqat Mumtaz and Lars Borin and Anju Saxena and Harald Hammarstr{\"o}m},
  editor    = {Kamil Ek{\v{s}}tein and V{\'a}clav Matou{\v{s}}ek},
  title     = {Automatic Extraction of Typological Linguistic Features from Descriptive Grammars},
  booktitle = {Text, Speech, and Dialogue: 20th International Conference, TSD 2017, Prague, Czech Republic, August 27-31, 2017, Proceedings},
  publisher = {Berlin: Springer},
  year      = 2017,
  series    = {Lecture Notes in Computer Science},
  volume    = 10415,
  pages     = {111--119},
}

@inproceedings{Borin-Lars2016-253952,
  title     = {Towards a Big Data View on {South Asian} Linguistic Diversity},
  abstract  = {South Asia with its rich and diverse linguistic tapestry of hundreds of languages, including many from four major language families, and a long history of intensive language contact, provides rich empirical data for studies of linguistic genealogy, linguistic typology, and language contact. South Asia is often referred to as a linguistic area, a region where, due to close contact and widespread multilingualism, languages have influenced one another to the extent that both related and unrelated languages are more similar on many linguistic levels than we would expect. However, with some rare exceptions, most studies are largely impressionistic, drawing examples from a few languages. In this paper we present our ongoing work aiming at turning the linguistic material available in Grierson’s Linguistic Survey of India (LSI) into a digital language resource, a database suitable for a broad array of linguistic investigations of the languages of South Asia. In addition to this, we aim to contribute to the methodological development of large-scale comparative linguistics drawing on digital language resources, by exploring NLP techniques for extracting linguistic information from free-text language descriptions of the kind found in the LSI.},
  booktitle = {WILDRE-3 – 3rd Workshop on Indian Language Data: Resources and Evaluation},
  author    = {Borin, Lars and Virk, Shafqat and Saxena, Anju},
  year      = {2016},
  publisher = {ELRA},
  address   = {Paris},
  pages     = {87--92},
}

@misc{cl:Nivre:UD20,
  author       = {Nivre, Joakim and Agi{\'c}, {\v{Z}}eljko and Ahrenberg, Lars and Aranzabe, Maria Jesus},
  title        = {{Universal Dependencies} 2.0},
  year         = {2017},
  url          = {http://hdl.handle.net/11234/1-1983},
  howpublished = {{LINDAT}/{CLARIN} digital library at the Institute of Formal and Applied Linguistics, Charles University in Prague},
}

@article{v:Segerer:RefLex,
  author           = {Segerer, Guillaume},
  title            = {{RefLex}: la reconstruction sans peine},
  journal          = {Faits de Langues},
  volume           = {47},
  pages            = {201--214},
  year             = {2016},
  fn               = {africa\segerer_reflex2016.pdf},
  glottolog_ref_id = {552072},
  hhtype           = {comparative},
  macro_area       = {Africa},
}

@misc{cl:Virk:TextCat,
  author       = {Shafqat Virk and Markus Forsberg and Harald Hammarström},
  title        = {{TextCat} for Language Profiling},
  year         = {2017},
  howpublished = {Submitted},
}

@article{cl:Xia:ODIN,
  author  = {Fei Xia and William D. Lewis and Michael Wayne Goodman and Glenn Slayden and Ryan Georgi and Joshua Crowgey and Emily M. Bender},
  title   = {Enriching a massively multilingual database of interlinear glossed text},
  journal = {Language Resources and Evaluation},
  year    = 2016,
  volume  = 50,
  number  = 2,
  pages   = {1--29},
}

@incollection{cl:Mikolov:Words-Phrases,
  author    = {Tomas Mikolov and Ilya Sutskever and Kai Chen and Gregory S. Corrado and Jeffrey Dean},
  editor    = {Christopher J. C. Burges and L{é}on Bottou and Zoubin Ghahramani and Kilian Q. Weinberger},
  title     = {Distributed Representations of Words and Phrases and their Compositionality},
  booktitle = {Advances in Neural Information Processing Systems 26 (NIPS 2013)},
  publisher = {Neural Information Processing Systems},
  address   = {Lake Tahoe, Nevada},
  pages     = {3111--3119},
  year      = {2013},
  url       = {http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality},
  bibsource = {dblp computer science bibliography, http://dblp.org},
  biburl    = {http://dblp.uni-trier.de/rec/bib/conf/nips/MikolovSCCD13},
  timestamp = {Thu, 07 May 2015 20:02:01 +0200},
}

@book{ling:deSaussure:Generale,
  author    = {de Saussure, Ferdinand},
  title     = {Cours de linguistique générale},
  publisher = {Paris: Payot},
  year      = {1916},
}

% --- Project output, 2020 onwards ---

@misc{Hammarstrom:Gender,
  author       = {Harald Hammarstr{\"o}m and One-Soon Her and Marc Tang and Olof Lundgren and Hilda Appelgren and William Zetterberg},
  title        = {Automatically building a database of gender/noun class/classifiers from digitized grammatical descriptions},
  howpublished = {Paper presented at the Lund Gender Workshop, 12 Mar 2021, Lund University},
  year         = 2021,
}

@misc{cl:Hammarstrom:Bibliographical-Parsing,
  author       = {Harald Hammarstr{\"o}m},
  title        = {Bibliographical Parsing of Descriptive Linguistic Literature},
  year         = 2022,
  howpublished = {Submitted},
}

@incollection{cl:Hammarstrom:Gramfinder,
  author    = {Harald Hammarstr{\"{o}}m},
  editor    = {Eduard C. Dragut and Yunyao Li and Lucian Popa and Slobodan Vucetic},
  title     = {Gramfinder: Human and Machine Reading of Grammatical Descriptions of the Languages of the World},
  booktitle = {3rd Workshop on Data Science with Human in the Loop, DaSH@KDD, Virtual Conference, August 15, 2021},
  year      = {2021},
  publisher = {DBLP},
  pages     = {1--6},
  vrcat     = {2. Peer-reviewed conference contributions},
  url       = {https://drive.google.com/file/d/1AOLVbevwUg1S1gK1CXGG8HXKXoBBZrBa/view},
  timestamp = {Fri, 14 Jan 2022 16:01:00 +0100},
  biburl    = {https://dblp.org/rec/conf/kdd/Hammarstrom21.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

@incollection{cl:Hammarstrom:Content-Separation,
  author    = {Harald Hammarstr{\"o}m},
  title     = {Inventory and Content Separation in Grammatical Descriptions of Languages of the World},
  booktitle = {Linking Theory and Practice of Digital Libraries: 25th International Conference on Theory and Practice of Digital Libraries, TPDL 2021, Virtual Event, September 13-17, 2021, Proceedings},
  publisher = {Berlin: Springer},
  editor    = {Gerd Berget and Mark Michael Hall and Daniel Brenn and Sanna Kumpulainen},
  year      = 2021,
  pages     = {129--140},
}

@incollection{cl:Hammarstrom:Prefix-Suffix,
  author    = {Harald Hammarstr{\"o}m},
  title     = {Measuring Prefixation and Suffixation in the Languages of the World},
  booktitle = {Proceedings of The 3rd Workshop on Research in Computational Typology and Multilingual NLP},
  publisher = {Stroudsburg, PA: Association for Computational Linguistics (ACL)},
  year      = 2021,
  pages     = {81--89},
}

@misc{cl:Hammarstrom:Good-Grammar,
  author       = {Harald Hammarstr{\"o}m},
  title        = {How good is this grammar? Term-counting techniques for measuring the comprehensiveness of grammatical description},
  year         = 2022,
  howpublished = {Submitted},
}

@article{cl:Zariquiey:Terms-Endangerment,
  author  = {Roberto Zariquiey and M{\'o}nica Arakaki and Javier Vera and Guido Torres and Claret Cuba and Carlos Barrientos and Aracelli Garc{\'i}a and Adriano Ingunza and Harald Hammarstr{\"o}m},
  title   = {Linking endangerment databases and descriptive linguistics: an assessment of the use of terms relating to language endangerment in grammars},
  journal = {Language Documentation \& Conservation},
  volume  = {16},
  vrcat   = {1. Peer-reviewed original articles},
  year    = {2022},
  pages   = {290--318},
}

@article{cl:Tang:WACL,
  author  = {One-Soon Her and Harald Hammarstr{\"o}m and Marc Allassonni{\`e}re-Tang},
  title   = {Defining numeral classifiers and identifying classifier languages of the world},
  journal = {Linguistics Vanguard},
  url     = {https://doi.org/10.1515/lingvan-2022-0006},
  volume  = {8},
  number  = 6,
  vrcat   = {1. Peer-reviewed original articles},
  pages   = {1--14},
  year    = {2022},
}

@incollection{virk-EtAl:2020:LREC,
  author    = {Virk, Shafqat Mumtaz and Hammarström, Harald and Forsberg, Markus and Wichmann, Søren},
  title     = {The {DReaM} Corpus: A Multilingual Annotated Corpus of Grammars for the World’s Languages},
  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference},
  month     = may,
  year      = {2020},
  address   = {Marseille, France},
  publisher = {Marseille, France: European Language Resources Association},
  pages     = {871--877},
  abstract  = {There exist as many as 7000 natural languages in the world, and a huge number of documents describing those languages have been produced over the years. Most of those documents are in paper format. Any attempts to use modern computational techniques and tools to process those documents will require them to be digitized first. In this paper, we report a multilingual digitized version of thousands of such documents searchable through some well-established corpus infrastructures. The corpus is annotated with various meta, word, and text level attributes to make searching and analysis easier and more useful.},
  url       = {https://www.aclweb.org/anthology/2020.lrec-1.109},
}

@incollection{cl:Virk:Profiles,
  author    = {Shafqat Mumtaz Virk and Harald Hammarstr{\"o}m and Lars Borin and Markus Forsberg and S{\o}ren Wichmann},
  title     = {From Linguistic Descriptions to Language Profiles},
  booktitle = {Proceedings of the 7th Workshop on Linked Data in Linguistics (LDL-2020)},
  publisher = {Marseille: European Language Resources Association (ELRA)},
  year      = 2020,
  pages     = {23--27},
}

@article{typ:Robbers:Goal-Source,
  author  = {Maja Robbers},
  title   = {A variety-driven approach to Goal-Source (a)symmetries},
  journal = {Submitted},
  pages   = {60},
  year    = 2023,
}

@article{typ:Allasionniere:Nominal-Categorization,
  author  = {Marc Allassonnière-Tang and Olof Lundgren and Maja Robbers and Sandra Cronhamn and Filip Larsson and One-Soon Her and Harald Hammarström and Gerd Carling},
  title   = {Expansion by migration and diffusion by contact is a source to the global diversity of linguistic nominal categorization systems},
  journal = {Nature: Humanities and Social Sciences Communications},
  url     = {https://doi.org/10.1057/s41599-021-01003-5},
  volume  = 8,
  number  = 331,
  pages   = {1--6, 1--50},
  year    = 2021,
}

@inproceedings{virk-etal-2021-deep,
  title     = {A Deep Learning System for Automatic Extraction of Typological Linguistic Information from Descriptive Grammars},
  author    = {Virk, Shafqat Mumtaz and Foster, Daniel and Sheikh Muhammad, Azam and Saleem, Raheela},
  booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)},
  month     = sep,
  year      = {2021},
  address   = {Held Online},
  publisher = {INCOMA Ltd.},
  url       = {https://aclanthology.org/2021.ranlp-main.166},
  pages     = {1480--1489},
  abstract  = {Linguistic typology is an area of linguistics concerned with analysis of and comparison between natural languages of the world based on their certain linguistic features. For that purpose, historically, the area has relied on manual extraction of linguistic feature values from textural descriptions of languages. This makes it a laborious and time expensive task and is also bound by human brain capacity. In this study, we present a deep learning system for the task of automatic extraction of linguistic features from textual descriptions of natural languages. First, textual descriptions are manually annotated with special structures called semantic frames. Those annotations are learned by a recurrent neural network, which is then used to annotate un-annotated text. Finally, the annotations are converted to linguistic feature values using a separate rule based module. Word embeddings, learned from general purpose text, are used as a major source of knowledge by the recurrent neural network. We compare the proposed deep learning system to a previously reported machine learning based system for the same task, and the deep learning system wins in terms of F1 scores with a fair margin. Such a system is expected to be a useful contribution for the automatic curation of typological databases, which otherwise are manually developed.},
}

@inproceedings{virk-etal-2021-data,
  title     = {A Data-Driven Semi-Automatic Framenet Development Methodology},
  author    = {Virk, Shafqat Mumtaz and Dann{\'e}lls, Dana and Borin, Lars and Forsberg, Markus},
  booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)},
  month     = sep,
  year      = {2021},
  address   = {Held Online},
  publisher = {INCOMA Ltd.},
  url       = {https://aclanthology.org/2021.ranlp-main.165},
  pages     = {1471--1479},
  abstract  = {FrameNet is a lexical semantic resource based on the linguistic theory of frame semantics. A number of framenet development strategies have been reported previously and all of them involve exploration of corpora and a fair amount of manual work. Despite previous efforts, there does not exist a well-thought-out automatic/semi-automatic methodology for frame construction. In this paper we propose a data-driven methodology for identification and semi-automatic construction of frames. As a proof of concept, we report on our initial attempts to build a wider-scale framenet for the legal domain (LawFN) using the proposed methodology. The constructed frames are stored in a lexical database and together with the annotated example sentences they have been made available through a web interface.},
}