[nlp-infra-devel] kérés: írjatok a listára, amint vmi elkészül -- newsml korpusz, tanítás

János Zsibrita zsibrita.janos at gmail.com
Tue Aug 2 13:51:50 CEST 2016


Sziasztok,

Mellékeltem a java kódot.

2016.08.02. 12:13 keltezéssel, Sass Bálint írta:
> Kedves Jani!
>
> Ez tök jó!
> Küldd el légyszi a java kódot, az nagyon is járható út. :)
>
> Az 'Adv' és a 'SubPos=x' között még van egy TAB, azaz 8 mező van, ugye?
> Ezek: id token lemma pos feat depTarget depType
>
> A token/lemma/pos/feat formátum a dep-re és konst-ra is vonatkozik, ugye?
>
> Kösz szépen:
> Bálint
>
> Ezt írtad ma:
>> Sziasztok,
>>
>> Egy példasor a trainből:
>>
>> 19 ismét ismét AdvSubPOS=x|Deg=none|Num=none|Per=none 20 MODE
>>
>> ahol
>> - az Adv a hfst által megadott szófaj
>> - a SubPOS=x|Deg=none|Num=none|Per=none pedig a morfológiai jegyekből 
>> kinyert feature-ök
>>
>> A szófaj kinyerése viszonylag egyszerű, 1-2 (egyelőre) hibás hfst 
>> output esetet (pl. [Adj][Nom]) leszámítva.
>> A morfológiai feature-ök a mate számára pedig a CoNLL 2009 formátum 
>> alapján történnek.
>>
>> Mindkettőre van nagyon megírt, egyszerű java kódom, amit el tudok 
>> küldeni, amennyiben a java járható út.
>

-------------- next part --------------
package util;

import java.util.*;

/**
 * Converts hfst-style morphological analyses such as {@code az[/Det|art.Def]}
 * (lemma followed by bracketed tags) into the lemma, the part-of-speech tag and
 * a CoNLL-style {@code key=value|key=value} feature string.
 *
 * <p>For every supported part of speech there is a table of default feature
 * values and a table mapping individual analysis tags to the feature values
 * that override those defaults.
 *
 * Created by zsibritajanos on 2016.08.02..
 */
public class DepTool {

  /**
   * Value used for features that are not specified by the analysis.
   */
  private static final String FEATURED_DEF_VALUE = "none";

  /**
   * Feature string returned when there is nothing to report.
   */
  private static final String EMPTY_FEATURES = "_";

  /**
   * NOUN: default features, in output order.
   */
  private static final Map<String, String> NOUN_DEF_MAP = new LinkedHashMap<>();

  static {
    NOUN_DEF_MAP.put("SubPOS", "c");
    NOUN_DEF_MAP.put("Num", "s");
    NOUN_DEF_MAP.put("Cas", "n");
    NOUN_DEF_MAP.put("NumP", FEATURED_DEF_VALUE);
    NOUN_DEF_MAP.put("PerP", FEATURED_DEF_VALUE);
    NOUN_DEF_MAP.put("NumPd", FEATURED_DEF_VALUE);
  }

  /**
   * VERB: default features, in output order.
   */
  private static final Map<String, String> VERB_DEF_MAP = new LinkedHashMap<>();

  static {
    VERB_DEF_MAP.put("SubPOS", "m");
    VERB_DEF_MAP.put("Mood", "i");
    VERB_DEF_MAP.put("Tense", "s");
    VERB_DEF_MAP.put("Per", FEATURED_DEF_VALUE);
    VERB_DEF_MAP.put("Num", FEATURED_DEF_VALUE);
    VERB_DEF_MAP.put("Def", FEATURED_DEF_VALUE);
  }

  /**
   * DET: default features, in output order.
   */
  private static final Map<String, String> DET_DEF_MAP = new LinkedHashMap<>();

  static {
    DET_DEF_MAP.put("SubPOS", "f");
  }

  /**
   * CONJ: default features, in output order.
   */
  private static final Map<String, String> CONJ_DEF_MAP = new LinkedHashMap<>();

  static {
    CONJ_DEF_MAP.put("SubPOS", "c");
    CONJ_DEF_MAP.put("Form", "s");
    CONJ_DEF_MAP.put("Coord", "w");
  }

  /**
   * ADV: default features, in output order.
   */
  private static final Map<String, String> ADV_DEF_MAP = new LinkedHashMap<>();

  static {
    ADV_DEF_MAP.put("SubPOS", "x");
    ADV_DEF_MAP.put("Deg", FEATURED_DEF_VALUE);
    ADV_DEF_MAP.put("Num", FEATURED_DEF_VALUE);
    ADV_DEF_MAP.put("Per", FEATURED_DEF_VALUE);
  }

  /**
   * ADJ: default features, in output order.
   */
  private static final Map<String, String> ADJ_DEF_MAP = new LinkedHashMap<>();

  static {
    ADJ_DEF_MAP.put("SubPOS", "f");
    ADJ_DEF_MAP.put("Deg", "p");
    ADJ_DEF_MAP.put("Num", "s");
    ADJ_DEF_MAP.put("Cas", "n");
    ADJ_DEF_MAP.put("NumP", FEATURED_DEF_VALUE);
    ADJ_DEF_MAP.put("PerP", FEATURED_DEF_VALUE);
    ADJ_DEF_MAP.put("NumPd", FEATURED_DEF_VALUE);
  }

  /**
   * NUM: default features, in output order.
   */
  private static final Map<String, String> NUM_DEF_MAP = new LinkedHashMap<>();

  static {
    NUM_DEF_MAP.put("SubPOS", "c");
    NUM_DEF_MAP.put("Num", "s");
    NUM_DEF_MAP.put("Cas", "n");
    NUM_DEF_MAP.put("Form", "d");
    NUM_DEF_MAP.put("NumP", FEATURED_DEF_VALUE);
    NUM_DEF_MAP.put("PerP", FEATURED_DEF_VALUE);
    NUM_DEF_MAP.put("NumPd", FEATURED_DEF_VALUE);
  }

  /**
   * INJ: default features, in output order.
   */
  private static final Map<String, String> INJ_DEF_MAP = new LinkedHashMap<>();

  static {
    INJ_DEF_MAP.put("SubPOS", "o");
  }

  /**
   * POST: default features, in output order.
   */
  private static final Map<String, String> POST_DEF_MAP = new LinkedHashMap<>();

  static {
    POST_DEF_MAP.put("SubPOS", "t");
  }

  /**
   * NOUN: analysis tag to the feature values it overrides.
   */
  private static final Map<String, List<ConllPair>> NOUN_INFRA_FEATURE_MAP = new TreeMap<>();

  static {
    // case
    NOUN_INFRA_FEATURE_MAP.put("[Supe]", Arrays.asList(new ConllPair("Cas", "p")));
    NOUN_INFRA_FEATURE_MAP.put("[Ins]", Arrays.asList(new ConllPair("Cas", "i")));

    NOUN_INFRA_FEATURE_MAP.put("[Dat]", Arrays.asList(new ConllPair("Cas", "g")));
    NOUN_INFRA_FEATURE_MAP.put("[All]", Arrays.asList(new ConllPair("Cas", "t")));

    NOUN_INFRA_FEATURE_MAP.put("[Nom]", Arrays.asList(new ConllPair("Cas", "n")));

    NOUN_INFRA_FEATURE_MAP.put("[Acc]", Arrays.asList(new ConllPair("Cas", "a")));
    NOUN_INFRA_FEATURE_MAP.put("[Subl]", Arrays.asList(new ConllPair("Cas", "s")));

    NOUN_INFRA_FEATURE_MAP.put("[Ine]", Arrays.asList(new ConllPair("Cas", "2")));
    NOUN_INFRA_FEATURE_MAP.put("[Ade]", Arrays.asList(new ConllPair("Cas", "3")));

    // number
    NOUN_INFRA_FEATURE_MAP.put("[Pl]", Arrays.asList(new ConllPair("Num", "p")));

    // possessive
    NOUN_INFRA_FEATURE_MAP.put("[Poss.3Sg]", Arrays.asList(new ConllPair("Num", "s"), new ConllPair("NumP", "s"), new ConllPair("PerP", "3")));
    NOUN_INFRA_FEATURE_MAP.put("[Pl.Poss.3Sg]", Arrays.asList(new ConllPair("Num", "p"), new ConllPair("NumP", "s"), new ConllPair("PerP", "3")));
  }

  /**
   * VERB: analysis tag to the feature values it overrides.
   */
  private static final Map<String, List<ConllPair>> VERB_INFRA_FEATURE_MAP = new TreeMap<>();

  static {
    // Past-tense rows may omit Tense because the verb default is already "s".
    VERB_INFRA_FEATURE_MAP.put("[Pst.NDef.3Sg]", Arrays.asList(new ConllPair("Per", "3"), new ConllPair("Num", "s"), new ConllPair("Def", "n")));
    VERB_INFRA_FEATURE_MAP.put("[Pst.Def.3Sg]", Arrays.asList(new ConllPair("Per", "3"), new ConllPair("Num", "s"), new ConllPair("Def", "y")));

    VERB_INFRA_FEATURE_MAP.put("[Prs.NDef.3Sg]", Arrays.asList(new ConllPair("Tense", "p"), new ConllPair("Per", "3"), new ConllPair("Num", "s"), new ConllPair("Def", "n")));
    VERB_INFRA_FEATURE_MAP.put("[Prs.NDef.3Pl]", Arrays.asList(new ConllPair("Tense", "p"), new ConllPair("Per", "3"), new ConllPair("Num", "p"), new ConllPair("Def", "n")));

    VERB_INFRA_FEATURE_MAP.put("[Prs.Def.3Sg]", Arrays.asList(new ConllPair("Tense", "p"), new ConllPair("Per", "3"), new ConllPair("Num", "s"), new ConllPair("Def", "y")));
    VERB_INFRA_FEATURE_MAP.put("[Prs.Def.3Pl]", Arrays.asList(new ConllPair("Tense", "p"), new ConllPair("Per", "3"), new ConllPair("Num", "p"), new ConllPair("Def", "y")));
    // Present tense, 2nd person plural: uses "p"/"2" like the other Prs rows
    // (this row used to carry the past tense and 3rd person values).
    VERB_INFRA_FEATURE_MAP.put("[Prs.NDef.2Pl]", Arrays.asList(new ConllPair("Tense", "p"), new ConllPair("Per", "2"), new ConllPair("Num", "p"), new ConllPair("Def", "n")));

    VERB_INFRA_FEATURE_MAP.put("[Pst.NDef.3Pl]", Arrays.asList(new ConllPair("Per", "3"), new ConllPair("Num", "p"), new ConllPair("Def", "n")));
    VERB_INFRA_FEATURE_MAP.put("[Pst.Def.3Pl]", Arrays.asList(new ConllPair("Tense", "s"), new ConllPair("Per", "3"), new ConllPair("Num", "p"), new ConllPair("Def", "y")));

    // NOTE(review): Mood "i" is the same as the verb default, and Num "p" looks
    // unusual for an infinitive -- confirm these values against the target tag set.
    VERB_INFRA_FEATURE_MAP.put("[Inf]", Arrays.asList(new ConllPair("Mood", "i"), new ConllPair("Num", "p"), new ConllPair("Def", "n")));

    VERB_INFRA_FEATURE_MAP.put("[_Caus/V]", Arrays.asList(new ConllPair("SubPOS", "s")));
    VERB_INFRA_FEATURE_MAP.put("[_Mod/V]", Arrays.asList(new ConllPair("SubPOS", "o")));
    VERB_INFRA_FEATURE_MAP.put("[_Freq/V]", Arrays.asList(new ConllPair("SubPOS", "f")));
  }

  /**
   * CONJ: analysis tag to the feature values it overrides (none yet).
   */
  private static final Map<String, List<ConllPair>> CONJ_INFRA_FEATURE_MAP = new TreeMap<>();

  /**
   * ADJ: analysis tag to the feature values it overrides.
   */
  private static final Map<String, List<ConllPair>> ADJ_INFRA_FEATURE_MAP = new TreeMap<>();

  static {
    ADJ_INFRA_FEATURE_MAP.put("[_Manner/Adv]", Arrays.asList(new ConllPair("Cas", "w")));
    ADJ_INFRA_FEATURE_MAP.put("[_Comp/Adj][Nom]", Arrays.asList(new ConllPair("Deg", "c")));
    ADJ_INFRA_FEATURE_MAP.put("[/Adj][Ade]", Arrays.asList(new ConllPair("Cas", "3")));
  }

  /**
   * NUM: analysis tag to the feature values it overrides.
   */
  private static final Map<String, List<ConllPair>> NUM_INFRA_FEATURE_MAP = new TreeMap<>();

  static {
    NUM_INFRA_FEATURE_MAP.put("[/Num|Digit]", Arrays.asList(new ConllPair("SubPOS", "f")));
    NUM_INFRA_FEATURE_MAP.put("[_Ord/Adj][Nom]", Arrays.asList(new ConllPair("SubPOS", "o")));
  }

  /**
   * INJ: analysis tag to the feature values it overrides (none yet).
   */
  private static final Map<String, List<ConllPair>> INJ_INFRA_FEATURE_MAP = new TreeMap<>();

  /**
   * POST: analysis tag to the feature values it overrides (none yet).
   */
  private static final Map<String, List<ConllPair>> POST_INFRA_FEATURE_MAP = new TreeMap<>();

  /**
   * ADV: analysis tag to the feature values it overrides.
   */
  private static final Map<String, List<ConllPair>> ADV_INFRA_FEATURE_MAP = new TreeMap<>();

  static {
    ADV_INFRA_FEATURE_MAP.put("[/Adv|Pro]", Arrays.asList(new ConllPair("SubPOS", "d")));
  }

  /**
   * DET: analysis tag to the feature values it overrides (none yet).
   */
  private static final Map<String, List<ConllPair>> DET_INFRA_FEATURE_MAP = new TreeMap<>();

  /**
   * CoNLL feature key-value pair.
   */
  public static class ConllPair {

    public final String feat;
    public final String value;

    public ConllPair(String feat, String value) {
      this.feat = feat;
      this.value = value;
    }
  }

  /**
   * Renders a feature map as a CoNLL feature string.
   *
   * @param feats feature name to value, in the order they should be printed
   * @return the entries joined as {@code key=value|key=value}, or {@code "_"}
   *         when the map is empty
   */
  private static String asConllString(Map<String, String> feats) {
    if (feats.isEmpty()) {
      return EMPTY_FEATURES;
    }

    StringBuilder joined = new StringBuilder();
    for (Map.Entry<String, String> entry : feats.entrySet()) {
      // Separator goes before every entry but the first, so nothing has to be trimmed afterwards.
      if (joined.length() > 0) {
        joined.append('|');
      }
      joined.append(entry.getKey()).append('=').append(entry.getValue());
    }

    return joined.toString();
  }

  /**
   * Builds the feature string for one part of speech: starts from the defaults
   * and overrides them with the values of every tag that occurs in the analysis.
   *
   * @param infra   the full analysis string
   * @param defMap  default feature values (its iteration order is the output order)
   * @param featMap analysis tag to overriding feature values; tags are matched by
   *                substring search and applied in the map's iteration order
   * @return the CoNLL feature string
   */
  private static String getFeatures(String infra, Map<String, String> defMap, Map<String, List<ConllPair>> featMap) {
    // Copying keeps the shared default table untouched and fixes the output order.
    Map<String, String> features = new LinkedHashMap<>(defMap);

    for (Map.Entry<String, List<ConllPair>> tagEntry : featMap.entrySet()) {
      if (infra.contains(tagEntry.getKey())) {
        for (ConllPair conllPair : tagEntry.getValue()) {
          features.put(conllPair.feat, conllPair.value);
        }
      }
    }

    return asConllString(features);
  }

  /**
   * Extracts the CoNLL features of an analysis.
   *
   * @param infra the analysis string, e.g. {@code az[/Det|art.Def]}
   * @param form  the surface form of the token
   * @return {@code "_"} when the analysis contains no tag at all; the feature
   *         string when the part of speech returned by
   *         {@link #getPos(String, String)} has a feature table; {@code null}
   *         for any other part of speech
   */
  public static String getFeatures(String infra, String form) {

    if (!infra.contains("[")) {
      return EMPTY_FEATURES;
    }

    String pos = getPos(infra, form);

    switch (pos) {
      case "N":
        return getFeatures(infra, NOUN_DEF_MAP, NOUN_INFRA_FEATURE_MAP);
      case "V":
        return getFeatures(infra, VERB_DEF_MAP, VERB_INFRA_FEATURE_MAP);
      case "Det":
        return getFeatures(infra, DET_DEF_MAP, DET_INFRA_FEATURE_MAP);
      case "Cnj":
        return getFeatures(infra, CONJ_DEF_MAP, CONJ_INFRA_FEATURE_MAP);
      case "Adv":
        return getFeatures(infra, ADV_DEF_MAP, ADV_INFRA_FEATURE_MAP);
      case "Adj":
        return getFeatures(infra, ADJ_DEF_MAP, ADJ_INFRA_FEATURE_MAP);
      case "Num":
        return getFeatures(infra, NUM_DEF_MAP, NUM_INFRA_FEATURE_MAP);
      case "Post":
        return getFeatures(infra, POST_DEF_MAP, POST_INFRA_FEATURE_MAP);
      case "Inj-Utt":
        return getFeatures(infra, INJ_DEF_MAP, INJ_INFRA_FEATURE_MAP);
      default:
        // No feature table for this part of speech.
        return null;
    }
  }

  /**
   * Gets the part-of-speech tag of an analysis.
   *
   * <p>The POS is the content of the first {@code [/...]} tag, cut at the first
   * {@code |}. When the analysis has no {@code [/} tag, the content of its first
   * bracketed tag is used instead. A leading {@code [/Supl]} tag is skipped so
   * that the tag following it is reported.
   *
   * @param infraAna the analysis string, e.g. {@code az[/Det|art.Def]}
   * @param form     the surface form of the token
   * @return {@code form} when the analysis contains no tag at all, otherwise the
   *         POS tag; an unterminated tag is read to the end of the string
   */
  public static String getPos(String infraAna, String form) {

    if (!infraAna.contains("[")) {
      return form;
    }

    String corrected = infraAna;
    if (!infraAna.contains("[/")) {
      // Drop the lemma so that the string starts at the first tag.
      corrected = posCorrector(corrected.substring(corrected.indexOf("[")));
    }

    int slashTag = corrected.indexOf("[/");
    // Without any "[/" tag, `corrected` starts with '[': skip it and read the first tag.
    String pos = corrected.substring(slashTag >= 0 ? slashTag + 2 : 1);

    int close = pos.indexOf("]");
    if (close >= 0) {
      pos = pos.substring(0, close);
    }

    int pipe = pos.indexOf("|");
    if (pipe >= 0) {
      pos = pos.substring(0, pipe);
    }

    if (pos.contains("Supl")) {
      String withoutSupl = infraAna.replace("[/Supl]", "");
      // Recurse only if something was removed; otherwise the call would never terminate.
      if (!withoutSupl.equals(infraAna)) {
        return getPos(withoutSupl, form);
      }
    }

    return pos;
  }

  /**
   * POS correction: rewrites a leading {@code [Adj]} or {@code [N]} tag into its
   * slashed form.
   *
   * @param pos the analysis with the lemma removed, starting at its first tag
   * @return the analysis with the leading tag rewritten, or unchanged
   */
  private static String posCorrector(String pos) {

    if (pos.startsWith("[Adj]")) {
      return "[/Adj]" + pos.substring("[Adj]".length());
    }

    if (pos.startsWith("[N]")) {
      return "[/N]" + pos.substring("[N]".length());
    }

    return pos;
  }

  /**
   * Gets the lemma of an analysis.
   *
   * @param infra the analysis string
   * @param form  the surface form of the token
   * @return the text before the first {@code [}, or {@code form} when the
   *         analysis contains no tag
   */
  private static String getLemma(String infra, String form) {
    int firstTag = infra.indexOf("[");
    return firstTag >= 0 ? infra.substring(0, firstTag) : form;
  }

  /**
   * Prints lemma, POS and features for a few sample analyses. The comment above
   * each sample is the corresponding reference row.
   */
  public static void main(String[] args) {

    // hétfő	N	SubPOS=c|Num=s|Cas=p|NumP=none|PerP=none|NumPd=none
    System.out.println(getLemma("hétfő[/N][Supe]", "Hétfőn"));
    System.out.println(getPos("hétfő[/N][Supe]", "Hétfőn"));
    System.out.println(getFeatures("hétfő[/N][Supe]", "Hétfőn"));

    System.out.println();

    // folytatódik	V	SubPOS=m|Mood=i|Tense=p|Per=3|Num=s|Def=n
    System.out.println(getLemma("folytatódik[/V][Prs.NDef.3Sg]", "folytatódik"));
    System.out.println(getPos("folytatódik[/V][Prs.NDef.3Sg]", "folytatódik"));
    System.out.println(getFeatures("folytatódik[/V][Prs.NDef.3Sg]", "folytatódik"));

    System.out.println();

    // az	T	SubPOS=f
    System.out.println(getLemma("az[/Det|art.Def]", "az"));
    System.out.println(getPos("az[/Det|art.Def]", "az"));
    System.out.println(getFeatures("az[/Det|art.Def]", "az"));

    System.out.println();

    // . . _
    System.out.println(getLemma("OTHER", "."));
    System.out.println(getPos("OTHER", "."));
    System.out.println(getFeatures("OTHER", "."));

    System.out.println();

    // hongkongi	A	SubPOS=f|Deg=p|Num=s|Cas=n|NumP=none|PerP=none|NumPd=none
    System.out.println(getLemma("hongkongi[Adj][Nom]", "hongkongi"));
    System.out.println(getPos("hongkongi[Adj][Nom]", "hongkongi"));
    System.out.println(getFeatures("hongkongi[Adj][Nom]", "hongkongi"));
  }
}


More information about the nlp-infra-devel mailing list