{"id":36276,"date":"2025-02-10T10:51:49","date_gmt":"2025-02-10T09:51:49","guid":{"rendered":"https:\/\/www.nb.no\/sprakbanken\/ressurskatalog\/oai-clarino-uib-no-slv-ssj500k-dep\/"},"modified":"2025-02-10T11:08:03","modified_gmt":"2025-02-10T10:08:03","slug":"oai-clarino-uib-no-slv-ssj500k-dep","status":"publish","type":"language-resource","link":"https:\/\/www.nb.no\/sprakbanken\/ressurskatalog\/oai-clarino-uib-no-slv-ssj500k-dep\/","title":{"rendered":"Tranining corpus ssj500kv1.2"},"content":{"rendered":"<p><?xml version='1.0' encoding='utf-8'?><br \/>\n<record><\/p>\n<header><identifier>oai:clarino.uib.no:slv-ssj500k-dep<\/identifier><datestamp>2016-10-14T22:28:18Z<\/datestamp><setSpec>INESS<\/setSpec><\/header>\n<p><metadata><CMD xmlns=\"http:\/\/www.clarin.eu\/cmd\/\"><br \/>\n<Header><br \/>\n<MdCreator>Carla Parra Escart\u00c3\u00adn<\/MdCreator><br \/>\n<MdCreationDate>2015-02-10<\/MdCreationDate><br \/>\n<MdSelfLink>http:\/\/hdl.handle.net\/11495\/D8A2-CFB1-49F7-1<\/MdSelfLink><br \/>\n<MdProfile>clarin.eu:cr1:p_1407745711925<\/MdProfile><br \/>\n<MdCollectionDisplayName>Clarino Bergen Centre &#8211; INESS<\/MdCollectionDisplayName><br \/>\n<\/Header><Resources><ResourceProxyList><ResourceProxy id=\"ref-landingpage\"><ResourceType mimetype=\"\">LandingPage<\/ResourceType><ResourceRef>hdl:11495\/DB26-0437-026E-4<\/ResourceRef><\/ResourceProxy><br \/>\n<\/ResourceProxyList><JournalFileProxyList \/><br \/>\n<ResourceRelationList \/><br \/>\n<IsPartOfList \/><\/Resources><br \/>\n<Components><br \/>\n<corpusProfile><br \/>\n    <resourceCommonInfo ComponentId=\"clarin.eu:cr1:c_1396012485126\"><br \/>\n        <resourceType>corpus<\/resourceType><br \/>\n        <identificationInfo ComponentId=\"clarin.eu:cr1:c_1396012485125\"><br \/>\n            <resourceName xml:lang=\"en\">Tranining corpus ssj500kv1.2<\/resourceName><br \/>\n            <description xml:lang=\"en\">The ssj500k training corpus is based on two training corpora, built within the JOS project. It contains the entire jos100k corpus and additional 400.000 words from a million-word jos1M corpus. When making the training corpus, the text, consisting of a sequence of characters (letters, numbers, spaces, symbols etc.), has to be first divided into meaningful units such as paragraphs, sentences, words and punctuation. This procedure is called segmentation (sentence identification) and tokenization (identification of tokens, i.e. words and punctuation). Two other types of information are attributed to each word: a basic form or a lemma (jagodam, jagodami -&gt; jagoda) and a morphosyntactic tag. The latter is formed as an acronym, containing the information on word class and related morphosyntactic features, for example Somei = samostalnik (noun), ob\u010dno ime (common noun), mo\u0161ki spol (masculine gender), ednina (singular), imenovalnik (nominative). The ssj500k corpus uses the JOS tagset that contains exactly 1,902 tags with combinations of categories and features according to the specifications of the JOS project.<\/description><br \/>\n            <url description=\"landing page @ INESS\">http:\/\/clarino.uib.no\/iness\/landing-page?resource=slv-ssj500k-dep&amp;view=short<\/url><br \/>\n            <url>http:\/\/eng.slovenscina.eu\/tehnologije\/ucni-korpus<\/url><br \/>\n            <PID description=\"landing page @ INESS\">hdl:11495\/DB26-0437-026E-4<\/PID><br \/>\n            <identifier>slv-ssj500k-dep<\/identifier><br \/>\n        <\/identificationInfo><br \/>\n        <distributionInfo ComponentId=\"clarin.eu:cr1:c_1396012485124\">\n            <licenceInfo ComponentId=\"clarin.eu:cr1:c_1396012485158\">\n                <userCategory>Public<\/userCategory><br \/>\n                <distributionAccessMedium>accessibleThroughInterface<\/distributionAccessMedium><br \/>\n                <executionLocation>http:\/\/hdl.handle.net\/11495\/DB26-0437-026E-4<\/executionLocation><br \/>\n                <attributionText xml:lang=\"en\">Krek, Simon and Erjavec, Toma\u017e (2014). Training corpus ssj500kv1.2. Jo\u017eef Stefan Institute, Slovenia. http:\/\/hdl.handle.net\/11495\/DB26-0437-026E-4<\/attributionText>\n                <licence ComponentId=\"clarin.eu:cr1:c_1447674760330\">\n                    <licenceFamily>Creative Commons (CC)<\/licenceFamily>\n                    <licenceName>Creative_Commons-BY-NC-SA (CC-BY-NC-SA)<\/licenceName>\n                    <licenceURL>https:\/\/creativecommons.org\/licenses\/by-nc-sa\/4.0\/<\/licenceURL>\n                    <conditionsOfUse>BY<\/conditionsOfUse><br \/>\n                    <conditionsOfUse>NC<\/conditionsOfUse><br \/>\n                    <conditionsOfUse>SA<\/conditionsOfUse>\n                <\/licence>\n                <licensor>\n                    <actorInfo ComponentId=\"clarin.eu:cr1:c_1396012485194\"><br \/>\n                        <actorType>organization<\/actorType><br \/>\n                        <role>iprHolder<\/role><br \/>\n                        <organizationInfo ComponentId=\"clarin.eu:cr1:c_1407745711883\"><br \/>\n                            <organizationName xml:lang=\"en\">Slovenian Ministry of Education, Science and Sport<\/organizationName><br \/>\n                        <\/organizationInfo><br \/>\n                    <\/actorInfo>\n                <\/licensor>\n            <\/licenceInfo>\n        <\/distributionInfo><br \/>\n        <contact><br \/>\n            <actorInfo ComponentId=\"clarin.eu:cr1:c_1396012485194\"><br \/>\n                <actorType>person<\/actorType><br \/>\n                <role>author<\/role>\n                <personInfo ComponentId=\"clarin.eu:cr1:c_1396012485192\">\n                    <surname>Krek<\/surname><br \/>\n                    <givenName>Simon<\/givenName><br \/>\n                    <affiliation><br \/>\n                        <organizationInfo ComponentId=\"clarin.eu:cr1:c_1407745711883\"><br \/>\n                            <organizationName xml:lang=\"en\">\u201cJo\u017eef Stefan\u201d Institute<\/organizationName><br \/>\n                        <\/organizationInfo><br \/>\n                    <\/affiliation>\n                <\/personInfo>\n            <\/actorInfo><br \/>\n        <\/contact><br \/>\n        <metadataInfo ComponentId=\"clarin.eu:cr1:c_1407745711922\"><br \/>\n            <metadataCreationDate>2015-02-10<\/metadataCreationDate><br \/>\n            <metadataLastDateUpdated>2016-10-14<\/metadataLastDateUpdated><br \/>\n            <metadataCreator><br \/>\n                <actorInfo ComponentId=\"clarin.eu:cr1:c_1396012485194\"><br \/>\n                    <actorType>person<\/actorType><br \/>\n                    <role>metadataCreator<\/role>\n                    <personInfo ComponentId=\"clarin.eu:cr1:c_1396012485192\">\n                        <surname>Parra Escart\u00edn<\/surname><br \/>\n                        <givenName>Carla<\/givenName><br \/>\n                        <affiliation><br \/>\n                            <organizationInfo ComponentId=\"clarin.eu:cr1:c_1407745711883\"><br \/>\n                                <organizationName xml:lang=\"en\">University of Bergen<\/organizationName><br \/>\n                                <organizationShortName xml:lang=\"en\">UiB<\/organizationShortName><br \/>\n                                <departmentName xml:lang=\"en\">Department of Linguistic, Literary and Aesthetic Studies<\/departmentName><br \/>\n                            <\/organizationInfo><br \/>\n                        <\/affiliation>\n                    <\/personInfo>\n                <\/actorInfo><br \/>\n            <\/metadataCreator><br \/>\n        <\/metadataInfo><br \/>\n        <versionInfo ComponentId=\"clarin.eu:cr1:c_1430905751648\"><br \/>\n            <version>Version 1.2 of the ssj500k training corpus with the category &laquo;organisation&raquo; added to the Named Entity annotation level.<\/version><br \/>\n        <\/versionInfo><br \/>\n    <\/resourceCommonInfo><br \/>\n    <corpusInfo ComponentId=\"clarin.eu:cr1:c_1407745711878\"><br \/>\n        <corpusType>Treebank<\/corpusType><br \/>\n        <corpusPartInfo ComponentId=\"clarin.eu:cr1:c_1407745711885\"><br \/>\n            <mediaType>text<\/mediaType><br \/>\n        <\/corpusPartInfo><br \/>\n        <corpusPartGeneralInfo ComponentId=\"clarin.eu:cr1:c_1407745711882\">\n            <lingualityInfo ComponentId=\"clarin.eu:cr1:c_1355150532313\">\n                <lingualityType>monolingual<\/lingualityType>\n            <\/lingualityInfo>\n            <languageInfo ComponentId=\"clarin.eu:cr1:c_1428388179423\"><br \/>\n                <languageId>sl<\/languageId><br \/>\n                <languageName>Slovenian<\/languageName><br \/>\n            <\/languageInfo><br \/>\n        <\/corpusPartGeneralInfo><br \/>\n    <\/corpusInfo><br \/>\n<\/corpusProfile><\/Components><\/CMD><\/metadata><\/record><\/p>\n","protected":false},"template":"","categories":[],"tags":[],"language-resource-type":[7569],"language-resource-origin":[7558],"class_list":["post-36276","language-resource","type-language-resource","status-publish","hentry"],"yoast_head":"<!-- This site is optimized with the Yoast SEO Premium plugin v27.1 (Yoast SEO v27.1.1) - https:\/\/yoast.com\/product\/yoast-seo-premium-wordpress\/ -->\n<title>Tranining corpus ssj500kv1.2 - Spr\u00e5kbanken<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/www.nb.no\/sprakbanken\/ressurskatalog\/oai-clarino-uib-no-slv-ssj500k-dep\/\" \/>\n<meta property=\"og:locale\" content=\"nb_NO\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"Tranining corpus ssj500kv1.2\" \/>\n<meta property=\"og:url\" content=\"https:\/\/www.nb.no\/sprakbanken\/ressurskatalog\/oai-clarino-uib-no-slv-ssj500k-dep\/\" \/>\n<meta property=\"og:site_name\" content=\"Spr\u00e5kbanken\" \/>\n<meta property=\"article:modified_time\" content=\"2025-02-10T10:08:03+00:00\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"Ansl. lesetid\" \/>\n\t<meta name=\"twitter:data1\" content=\"2 minutter\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"WebPage\",\"@id\":\"https:\/\/www.nb.no\/sprakbanken\/ressurskatalog\/oai-clarino-uib-no-slv-ssj500k-dep\/\",\"url\":\"https:\/\/www.nb.no\/sprakbanken\/ressurskatalog\/oai-clarino-uib-no-slv-ssj500k-dep\/\",\"name\":\"Tranining corpus ssj500kv1.2 - Spr\u00e5kbanken\",\"isPartOf\":{\"@id\":\"https:\/\/www.nb.no\/sprakbanken\/#website\"},\"datePublished\":\"2025-02-10T09:51:49+00:00\",\"dateModified\":\"2025-02-10T10:08:03+00:00\",\"breadcrumb\":{\"@id\":\"https:\/\/www.nb.no\/sprakbanken\/ressurskatalog\/oai-clarino-uib-no-slv-ssj500k-dep\/#breadcrumb\"},\"inLanguage\":\"nb-NO\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/www.nb.no\/sprakbanken\/ressurskatalog\/oai-clarino-uib-no-slv-ssj500k-dep\/\"]}]},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/www.nb.no\/sprakbanken\/ressurskatalog\/oai-clarino-uib-no-slv-ssj500k-dep\/#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"Home\",\"item\":\"https:\/\/www.nb.no\/sprakbanken\/\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"Resources from the resource bank\",\"item\":\"https:\/\/www.nb.no\/sprakbanken\/en\/resource-catalogue\/\"},{\"@type\":\"ListItem\",\"position\":3,\"name\":\"Tranining corpus ssj500kv1.2\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/www.nb.no\/sprakbanken\/#website\",\"url\":\"https:\/\/www.nb.no\/sprakbanken\/\",\"name\":\"Spr\u00e5kbanken\",\"description\":\"\",\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/www.nb.no\/sprakbanken\/?s={search_term_string}\"},\"query-input\":{\"@type\":\"PropertyValueSpecification\",\"valueRequired\":true,\"valueName\":\"search_term_string\"}}],\"inLanguage\":\"nb-NO\"}]}<\/script>\n<!-- \/ Yoast SEO Premium plugin. -->","yoast_head_json":{"title":"Tranining corpus ssj500kv1.2 - Spr\u00e5kbanken","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/www.nb.no\/sprakbanken\/ressurskatalog\/oai-clarino-uib-no-slv-ssj500k-dep\/","og_locale":"nb_NO","og_type":"article","og_title":"Tranining corpus ssj500kv1.2","og_url":"https:\/\/www.nb.no\/sprakbanken\/ressurskatalog\/oai-clarino-uib-no-slv-ssj500k-dep\/","og_site_name":"Spr\u00e5kbanken","article_modified_time":"2025-02-10T10:08:03+00:00","twitter_card":"summary_large_image","twitter_misc":{"Ansl. lesetid":"2 minutter"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"WebPage","@id":"https:\/\/www.nb.no\/sprakbanken\/ressurskatalog\/oai-clarino-uib-no-slv-ssj500k-dep\/","url":"https:\/\/www.nb.no\/sprakbanken\/ressurskatalog\/oai-clarino-uib-no-slv-ssj500k-dep\/","name":"Tranining corpus ssj500kv1.2 - Spr\u00e5kbanken","isPartOf":{"@id":"https:\/\/www.nb.no\/sprakbanken\/#website"},"datePublished":"2025-02-10T09:51:49+00:00","dateModified":"2025-02-10T10:08:03+00:00","breadcrumb":{"@id":"https:\/\/www.nb.no\/sprakbanken\/ressurskatalog\/oai-clarino-uib-no-slv-ssj500k-dep\/#breadcrumb"},"inLanguage":"nb-NO","potentialAction":[{"@type":"ReadAction","target":["https:\/\/www.nb.no\/sprakbanken\/ressurskatalog\/oai-clarino-uib-no-slv-ssj500k-dep\/"]}]},{"@type":"BreadcrumbList","@id":"https:\/\/www.nb.no\/sprakbanken\/ressurskatalog\/oai-clarino-uib-no-slv-ssj500k-dep\/#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"Home","item":"https:\/\/www.nb.no\/sprakbanken\/"},{"@type":"ListItem","position":2,"name":"Resources from the resource bank","item":"https:\/\/www.nb.no\/sprakbanken\/en\/resource-catalogue\/"},{"@type":"ListItem","position":3,"name":"Tranining corpus ssj500kv1.2"}]},{"@type":"WebSite","@id":"https:\/\/www.nb.no\/sprakbanken\/#website","url":"https:\/\/www.nb.no\/sprakbanken\/","name":"Spr\u00e5kbanken","description":"","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/www.nb.no\/sprakbanken\/?s={search_term_string}"},"query-input":{"@type":"PropertyValueSpecification","valueRequired":true,"valueName":"search_term_string"}}],"inLanguage":"nb-NO"}]}},"lang":"nb","translations":{"nb":36276,"en":36279},"pll_sync_post":[],"_links":{"self":[{"href":"https:\/\/www.nb.no\/sprakbanken\/wp-json\/wp\/v2\/language-resource\/36276","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.nb.no\/sprakbanken\/wp-json\/wp\/v2\/language-resource"}],"about":[{"href":"https:\/\/www.nb.no\/sprakbanken\/wp-json\/wp\/v2\/types\/language-resource"}],"wp:attachment":[{"href":"https:\/\/www.nb.no\/sprakbanken\/wp-json\/wp\/v2\/media?parent=36276"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.nb.no\/sprakbanken\/wp-json\/wp\/v2\/categories?post=36276"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.nb.no\/sprakbanken\/wp-json\/wp\/v2\/tags?post=36276"},{"taxonomy":"language-resource-type","embeddable":true,"href":"https:\/\/www.nb.no\/sprakbanken\/wp-json\/wp\/v2\/language-resource-type?post=36276"},{"taxonomy":"language-resource-origin","embeddable":true,"href":"https:\/\/www.nb.no\/sprakbanken\/wp-json\/wp\/v2\/language-resource-origin?post=36276"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}