[nlp-infra-devel] kérés: írjatok a listára, amint vmi elkészül -- newsml korpusz, tanítás
János Zsibrita
zsibrita.janos at gmail.com
Tue Aug 2 13:51:50 CEST 2016
Sziasztok,
Mellékeltem a java kódot.
2016.08.02. 12:13 keltezéssel, Sass Bálint írta:
> Kedves Jani!
>
> Ez tök jó!
> Küldd el légyszi a java kódot, az nagyon is járható út. :)
>
> Az 'Adv' és a 'SubPos=x' között még van egy TAB, azaz 8 mező van, ugye?
> Ezek: id token lemma pos feat depTarget depType
>
> A token/lemma/pos/feat formátum a dep-re és konst-ra is vonatkozik, ugye?
>
> Kösz szépen:
> Bálint
>
> Ezt írtad ma:
>> Sziasztok,
>>
>> Egy példasor a trainből:
>>
>> 19 ismét ismét AdvSubPOS=x|Deg=none|Num=none|Per=none 20 MODE
>>
>> ahol
>> - az Adv a hfst által megadott szófaj
>> - a SubPOS=x|Deg=none|Num=none|Per=none pedig a morfológiai jegyekből
>> kinyert feature-ök
>>
>> A szófaj kinyerése viszonylag egyszerű, 1-2 (egyelőre) hibás hfst
>> output esetet (pl. [Adj][Nom]) leszámítva.
>> A morfológiai feature-ök a mate számára pedig a CoNLL 2009 formátum
>> alapján történnek.
>>
>> Mindkettőre van nagyon megírt, egyszerű java kódom, amit el tudok
>> küldeni, amennyiben a java járható út.
>
-------------- next part --------------
package util;
import java.util.*;
/**
 * Converts bracketed morphological analyses (called "infra" analyses in this file, e.g.
 * {@code lemma[/N][Supe]}) into a lemma, a part-of-speech tag and a CoNLL-style
 * {@code key=value|key=value} feature string.
 *
 * <p>For every supported part of speech there are two tables: a map of default feature values
 * (insertion-ordered, which fixes the order of the emitted features) and a map from analysis
 * tag substrings to the feature values that override those defaults.
 *
 * <p>Created by zsibritajanos on 2016.08.02.
 */
public class DepTool {

    /** Placeholder value of a feature that is not specified for a token. */
    private static final String FEATURED_DEF_VALUE = "none";

    /** Feature string returned when a token has no features at all. */
    private static final String NO_FEATURES = "_";

    /** Default features of nouns. */
    private static final Map<String, String> NOUN_DEF_MAP = new LinkedHashMap<>();

    static {
        NOUN_DEF_MAP.put("SubPOS", "c");
        NOUN_DEF_MAP.put("Num", "s");
        NOUN_DEF_MAP.put("Cas", "n");
        NOUN_DEF_MAP.put("NumP", FEATURED_DEF_VALUE);
        NOUN_DEF_MAP.put("PerP", FEATURED_DEF_VALUE);
        NOUN_DEF_MAP.put("NumPd", FEATURED_DEF_VALUE);
    }

    /** Default features of verbs. */
    private static final Map<String, String> VERB_DEF_MAP = new LinkedHashMap<>();

    static {
        VERB_DEF_MAP.put("SubPOS", "m");
        VERB_DEF_MAP.put("Mood", "i");
        VERB_DEF_MAP.put("Tense", "s");
        VERB_DEF_MAP.put("Per", FEATURED_DEF_VALUE);
        VERB_DEF_MAP.put("Num", FEATURED_DEF_VALUE);
        VERB_DEF_MAP.put("Def", FEATURED_DEF_VALUE);
    }

    /** Default features of determiners. */
    private static final Map<String, String> DET_DEF_MAP = new LinkedHashMap<>();

    static {
        DET_DEF_MAP.put("SubPOS", "f");
    }

    /** Default features of conjunctions. */
    private static final Map<String, String> CONJ_DEF_MAP = new LinkedHashMap<>();

    static {
        CONJ_DEF_MAP.put("SubPOS", "c");
        CONJ_DEF_MAP.put("Form", "s");
        CONJ_DEF_MAP.put("Coord", "w");
    }

    /** Default features of adverbs. */
    private static final Map<String, String> ADV_DEF_MAP = new LinkedHashMap<>();

    static {
        ADV_DEF_MAP.put("SubPOS", "x");
        ADV_DEF_MAP.put("Deg", FEATURED_DEF_VALUE);
        ADV_DEF_MAP.put("Num", FEATURED_DEF_VALUE);
        ADV_DEF_MAP.put("Per", FEATURED_DEF_VALUE);
    }

    /** Default features of adjectives. */
    private static final Map<String, String> ADJ_DEF_MAP = new LinkedHashMap<>();

    static {
        ADJ_DEF_MAP.put("SubPOS", "f");
        ADJ_DEF_MAP.put("Deg", "p");
        ADJ_DEF_MAP.put("Num", "s");
        ADJ_DEF_MAP.put("Cas", "n");
        ADJ_DEF_MAP.put("NumP", FEATURED_DEF_VALUE);
        ADJ_DEF_MAP.put("PerP", FEATURED_DEF_VALUE);
        ADJ_DEF_MAP.put("NumPd", FEATURED_DEF_VALUE);
    }

    /** Default features of numerals. */
    private static final Map<String, String> NUM_DEF_MAP = new LinkedHashMap<>();

    static {
        NUM_DEF_MAP.put("SubPOS", "c");
        NUM_DEF_MAP.put("Num", "s");
        NUM_DEF_MAP.put("Cas", "n");
        NUM_DEF_MAP.put("Form", "d");
        NUM_DEF_MAP.put("NumP", FEATURED_DEF_VALUE);
        NUM_DEF_MAP.put("PerP", FEATURED_DEF_VALUE);
        NUM_DEF_MAP.put("NumPd", FEATURED_DEF_VALUE);
    }

    /** Default features of interjections. */
    private static final Map<String, String> INJ_DEF_MAP = new LinkedHashMap<>();

    static {
        INJ_DEF_MAP.put("SubPOS", "o");
    }

    /** Default features of postpositions. */
    private static final Map<String, String> POST_DEF_MAP = new LinkedHashMap<>();

    static {
        POST_DEF_MAP.put("SubPOS", "t");
    }

    /** Noun overrides: analysis tag substring to the features it sets. */
    private static final Map<String, List<ConllPair>> NOUN_INFRA_FEATURE_MAP = new TreeMap<>();

    static {
        // case
        NOUN_INFRA_FEATURE_MAP.put("[Supe]", pairs("Cas", "p"));
        NOUN_INFRA_FEATURE_MAP.put("[Ins]", pairs("Cas", "i"));
        NOUN_INFRA_FEATURE_MAP.put("[Dat]", pairs("Cas", "g"));
        NOUN_INFRA_FEATURE_MAP.put("[All]", pairs("Cas", "t"));
        NOUN_INFRA_FEATURE_MAP.put("[Nom]", pairs("Cas", "n"));
        NOUN_INFRA_FEATURE_MAP.put("[Acc]", pairs("Cas", "a"));
        NOUN_INFRA_FEATURE_MAP.put("[Subl]", pairs("Cas", "s"));
        NOUN_INFRA_FEATURE_MAP.put("[Ine]", pairs("Cas", "2"));
        NOUN_INFRA_FEATURE_MAP.put("[Ade]", pairs("Cas", "3"));
        // number and possessor
        NOUN_INFRA_FEATURE_MAP.put("[Pl]", pairs("Num", "p"));
        NOUN_INFRA_FEATURE_MAP.put("[Poss.3Sg]", pairs("Num", "s", "NumP", "s", "PerP", "3"));
        NOUN_INFRA_FEATURE_MAP.put("[Pl.Poss.3Sg]", pairs("Num", "p", "NumP", "s", "PerP", "3"));
    }

    /** Verb overrides: analysis tag substring to the features it sets. */
    private static final Map<String, List<ConllPair>> VERB_INFRA_FEATURE_MAP = new TreeMap<>();

    static {
        // Past-tense entries without an explicit Tense rely on the default Tense=s.
        VERB_INFRA_FEATURE_MAP.put("[Pst.NDef.3Sg]", pairs("Per", "3", "Num", "s", "Def", "n"));
        VERB_INFRA_FEATURE_MAP.put("[Pst.Def.3Sg]", pairs("Per", "3", "Num", "s", "Def", "y"));
        VERB_INFRA_FEATURE_MAP.put("[Prs.NDef.3Sg]",
                pairs("Tense", "p", "Per", "3", "Num", "s", "Def", "n"));
        VERB_INFRA_FEATURE_MAP.put("[Prs.NDef.3Pl]",
                pairs("Tense", "p", "Per", "3", "Num", "p", "Def", "n"));
        VERB_INFRA_FEATURE_MAP.put("[Prs.Def.3Sg]",
                pairs("Tense", "p", "Per", "3", "Num", "s", "Def", "y"));
        VERB_INFRA_FEATURE_MAP.put("[Prs.Def.3Pl]",
                pairs("Tense", "p", "Per", "3", "Num", "p", "Def", "y"));
        // Present tense, 2nd person plural. This row used to carry Tense=s and Per=3, copied
        // from the past-tense 3Pl rows, which contradicted its own key.
        VERB_INFRA_FEATURE_MAP.put("[Prs.NDef.2Pl]",
                pairs("Tense", "p", "Per", "2", "Num", "p", "Def", "n"));
        VERB_INFRA_FEATURE_MAP.put("[Pst.NDef.3Pl]", pairs("Per", "3", "Num", "p", "Def", "n"));
        VERB_INFRA_FEATURE_MAP.put("[Pst.Def.3Pl]",
                pairs("Tense", "s", "Per", "3", "Num", "p", "Def", "y"));
        // NOTE(review): Mood=i is also the verb default; confirm this is the intended value
        // for infinitives.
        VERB_INFRA_FEATURE_MAP.put("[Inf]", pairs("Mood", "i", "Num", "p", "Def", "n"));
        VERB_INFRA_FEATURE_MAP.put("[_Caus/V]", pairs("SubPOS", "s"));
        VERB_INFRA_FEATURE_MAP.put("[_Mod/V]", pairs("SubPOS", "o"));
        VERB_INFRA_FEATURE_MAP.put("[_Freq/V]", pairs("SubPOS", "f"));
    }

    /** Conjunction overrides (none defined yet). */
    private static final Map<String, List<ConllPair>> CONJ_INFRA_FEATURE_MAP = new TreeMap<>();

    /** Adjective overrides: analysis tag substring to the features it sets. */
    private static final Map<String, List<ConllPair>> ADJ_INFRA_FEATURE_MAP = new TreeMap<>();

    static {
        ADJ_INFRA_FEATURE_MAP.put("[_Manner/Adv]", pairs("Cas", "w"));
        ADJ_INFRA_FEATURE_MAP.put("[_Comp/Adj][Nom]", pairs("Deg", "c"));
        ADJ_INFRA_FEATURE_MAP.put("[/Adj][Ade]", pairs("Cas", "3"));
    }

    /** Numeral overrides: analysis tag substring to the features it sets. */
    private static final Map<String, List<ConllPair>> NUM_INFRA_FEATURE_MAP = new TreeMap<>();

    static {
        NUM_INFRA_FEATURE_MAP.put("[/Num|Digit]", pairs("SubPOS", "f"));
        NUM_INFRA_FEATURE_MAP.put("[_Ord/Adj][Nom]", pairs("SubPOS", "o"));
    }

    /** Interjection overrides (none defined yet). */
    private static final Map<String, List<ConllPair>> INJ_INFRA_FEATURE_MAP = new TreeMap<>();

    /** Postposition overrides (none defined yet). */
    private static final Map<String, List<ConllPair>> POST_INFRA_FEATURE_MAP = new TreeMap<>();

    /** Adverb overrides: analysis tag substring to the features it sets. */
    private static final Map<String, List<ConllPair>> ADV_INFRA_FEATURE_MAP = new TreeMap<>();

    static {
        ADV_INFRA_FEATURE_MAP.put("[/Adv|Pro]", pairs("SubPOS", "d"));
    }

    /** Determiner overrides (none defined yet). */
    private static final Map<String, List<ConllPair>> DET_INFRA_FEATURE_MAP = new TreeMap<>();

    /**
     * CoNLL feature key-value pair.
     */
    public static class ConllPair {
        /** Feature name, e.g. {@code Cas}. */
        public final String feat;
        /** Feature value, e.g. {@code n}. */
        public final String value;

        public ConllPair(String feat, String value) {
            this.feat = feat;
            this.value = value;
        }
    }

    /**
     * Builds a list of feature pairs from alternating names and values.
     *
     * @param keyValues feature name, feature value, feature name, feature value, ...
     * @return the pairs in the given order
     * @throws IllegalArgumentException if an odd number of strings is passed
     */
    private static List<ConllPair> pairs(String... keyValues) {
        if (keyValues.length % 2 != 0) {
            throw new IllegalArgumentException(
                    "Expected name/value pairs but got " + keyValues.length + " strings");
        }
        ConllPair[] result = new ConllPair[keyValues.length / 2];
        for (int i = 0; i < result.length; i++) {
            result[i] = new ConllPair(keyValues[2 * i], keyValues[2 * i + 1]);
        }
        return Arrays.asList(result);
    }

    /**
     * Serializes a feature map as {@code key=value} items joined with {@code |}.
     *
     * @param feats features in output order
     * @return the joined feature string, or {@code "_"} when the map is empty
     */
    private static String asConllString(Map<String, String> feats) {
        if (feats.isEmpty()) {
            // Removing a trailing separator from an empty string would throw; use the
            // placeholder that getFeatures already returns for tokens without features.
            return NO_FEATURES;
        }
        StringBuilder joined = new StringBuilder();
        for (Map.Entry<String, String> entry : feats.entrySet()) {
            if (joined.length() > 0) {
                joined.append('|');
            }
            joined.append(entry.getKey()).append('=').append(entry.getValue());
        }
        return joined.toString();
    }

    /**
     * Extracts the CoNLL features for one part of speech: starts from the defaults and applies
     * every override whose tag substring occurs in the analysis.
     *
     * @param infra   the full analysis string
     * @param defMap  default feature values; its iteration order is the output order
     * @param featMap overrides keyed by analysis tag substring
     * @return the serialized feature string
     */
    private static String getFeatures(String infra, Map<String, String> defMap,
                                      Map<String, List<ConllPair>> featMap) {
        // Copy so that the shared default table is never modified.
        Map<String, String> features = new LinkedHashMap<>(defMap);
        // Overrides are applied in the map's iteration order; a later match wins when two
        // overrides set the same feature.
        for (Map.Entry<String, List<ConllPair>> override : featMap.entrySet()) {
            if (infra.contains(override.getKey())) {
                for (ConllPair conllPair : override.getValue()) {
                    features.put(conllPair.feat, conllPair.value);
                }
            }
        }
        return asConllString(features);
    }

    /**
     * Extracts the CoNLL features of a token.
     *
     * @param infra the analysis string of the token
     * @param form  the surface form of the token, passed on to {@link #getPos(String, String)}
     * @return {@code "_"} when the analysis contains no bracketed tag, the feature string for a
     *         supported part of speech, or {@code null} when the part of speech has no feature
     *         table
     */
    public static String getFeatures(String infra, String form) {
        if (!infra.contains("[")) {
            return NO_FEATURES;
        }
        String pos = getPos(infra, form);
        switch (pos) {
            case "N":
                return getFeatures(infra, NOUN_DEF_MAP, NOUN_INFRA_FEATURE_MAP);
            case "V":
                return getFeatures(infra, VERB_DEF_MAP, VERB_INFRA_FEATURE_MAP);
            case "Det":
                return getFeatures(infra, DET_DEF_MAP, DET_INFRA_FEATURE_MAP);
            case "Cnj":
                return getFeatures(infra, CONJ_DEF_MAP, CONJ_INFRA_FEATURE_MAP);
            case "Adv":
                return getFeatures(infra, ADV_DEF_MAP, ADV_INFRA_FEATURE_MAP);
            case "Adj":
                return getFeatures(infra, ADJ_DEF_MAP, ADJ_INFRA_FEATURE_MAP);
            case "Num":
                return getFeatures(infra, NUM_DEF_MAP, NUM_INFRA_FEATURE_MAP);
            case "Post":
                return getFeatures(infra, POST_DEF_MAP, POST_INFRA_FEATURE_MAP);
            case "Inj-Utt":
                return getFeatures(infra, INJ_DEF_MAP, INJ_INFRA_FEATURE_MAP);
            default:
                // Kept as null for backward compatibility: callers may test for it.
                return null;
        }
    }

    /**
     * Gets the part-of-speech tag of an analysis.
     *
     * <p>The tag is the content of the first {@code [/...]} element, cut at the first
     * {@code |}. An analysis without any bracket yields the surface form itself. A leading
     * {@code [/Supl]} element is skipped so that the tag of the element after it is returned.
     *
     * @param infraAna the analysis string
     * @param form     the surface form, returned when the analysis has no bracketed tag
     * @return the part-of-speech tag
     */
    public static String getPos(String infraAna, String form) {
        if (!infraAna.contains("[")) {
            return form;
        }
        String corrected = infraAna;
        if (!infraAna.contains("[/")) {
            // No well-formed POS element: drop the lemma and try to repair the first tag.
            corrected = posCorrector(corrected.substring(corrected.indexOf("[")));
        }
        int marker = corrected.indexOf("[/");
        // Without a marker the string starts with the "[" it was cut at, so the first tag
        // begins at index 1; with a marker the tag begins after "[/".
        String pos = corrected.substring(marker >= 0 ? marker + 2 : 1);
        int close = pos.indexOf("]");
        if (close >= 0) {
            pos = pos.substring(0, close);
        }
        int pipe = pos.indexOf("|");
        if (pipe >= 0) {
            pos = pos.substring(0, pipe);
        }
        if (pos.contains("Supl")) {
            String stripped = infraAna.replace("[/Supl]", "");
            // Recurse only when something was removed and a tag is left. The result of this
            // call used to be thrown away, so "Supl" was always returned here.
            if (!stripped.equals(infraAna) && stripped.contains("[")) {
                return getPos(stripped, form);
            }
        }
        return pos;
    }

    /**
     * Repairs analyses whose first tag lacks the {@code /} of a POS element, e.g.
     * {@code [Adj][Nom]} becomes {@code [/Adj][Nom]}.
     *
     * @param pos the analysis starting at its first {@code [}
     * @return the repaired analysis, or the input unchanged when no repair applies
     */
    private static String posCorrector(String pos) {
        if (pos.startsWith("[Adj]")) {
            return pos.replace("[Adj]", "[/Adj]");
        }
        if (pos.startsWith("[N]")) {
            return pos.replace("[N]", "[/N]");
        }
        return pos;
    }

    /**
     * Gets the lemma: the text before the first {@code [}, or the surface form when the
     * analysis has no bracketed tag.
     *
     * @param infra the analysis string
     * @param form  the surface form
     * @return the lemma
     */
    private static String getLemma(String infra, String form) {
        int firstTag = infra.indexOf("[");
        return firstTag >= 0 ? infra.substring(0, firstTag) : form;
    }

    /**
     * Prints lemma, POS and features for a few sample analyses, separated by blank lines.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Each row is {analysis, surface form}. Non-ASCII letters are written as Unicode
        // escapes so the file compiles the same way under any source encoding.
        String[][] samples = {
            // reference: N SubPOS=c|Num=s|Cas=p|NumP=none|PerP=none|NumPd=none
            {"h\u00e9tf\u0151[/N][Supe]", "H\u00e9tf\u0151n"},
            // reference: V SubPOS=m|Mood=i|Tense=p|Per=3|Num=s|Def=n
            {"folytat\u00f3dik[/V][Prs.NDef.3Sg]", "folytat\u00f3dik"},
            // reference: az T SubPOS=f
            {"az[/Det|art.Def]", "az"},
            // reference: . . _
            {"OTHER", "."},
            // reference: hongkongi A SubPOS=f|Deg=p|Num=s|Cas=n|NumP=none|PerP=none|NumPd=none
            {"hongkongi[Adj][Nom]", "hongkongi"},
        };
        for (int i = 0; i < samples.length; i++) {
            if (i > 0) {
                System.out.println();
            }
            String infra = samples[i][0];
            String form = samples[i][1];
            System.out.println(getLemma(infra, form));
            System.out.println(getPos(infra, form));
            System.out.println(getFeatures(infra, form));
        }
    }
}
More information about the nlp-infra-devel
mailing list