# All Downloads are FREE. Search and download functionalities are using the official Maven repository.

# org.clulab.reach.biogrammar.entities.entities.yml Maven / Gradle / Ivy

# The newest version!
###################
# Site rules
###################

# Spelled-out amino-acid residues labelled as Site: a full amino-acid name
# (case-insensitive), optionally followed by a token matching /acid/, a token
# matching "residue", and a token ending in three digits.
- name: site_long
  label: Site
  type: token
  priority: 1
  action: mkBioMention
  pattern: |
    [word = /(?i)^(alanine|arginine|asparagine|aspartic|aspartate|cysteine|glutamic|glutamate|glutamine|glycine|histidine|isoleucine|leucine|lysine|methionine|phenylalanine|proline|serine|threonine|tryptophan|tyrosine|valine)$/]
    [word = /acid/]?
    [word = /(?i)residue/]?
    [word = /\d{3}$/]?

# this one is ambiguous: there are proteins that have the same form; in such cases, prefer the proteins or chemicals
# Single-letter-code site rule; produces a Site mention through mkBioMention
# at priority 5.
- name: site_1letter_a
  label: Site
  priority: 5
  type: token
  action: mkBioMention
  # NOTE(review): the pattern below is not a usable token pattern as written:
  # `(?"` is not a valid group opener and the double quote is never closed.
  # Presumably part of the rule text was lost when this copy was produced;
  # TODO restore the full pattern from the upstream grammar before loading.
  pattern: |
    (?")

# NB: the period is only there to handle errorful tokenization by processors at sentence endings.
# Single-letter-code site rule; produces a Site mention through mkBioMention
# at priority 5. The visible fragment matches a token ending in digits
# (optionally preceded by one letter) that is not followed by a token matching
# a residue letter or "mM", and then an optional ")" token.
- name: site_1letter_c
  label: Site
  priority: 5
  type: token
  action: mkBioMention
  # NOTE(review): the first pattern line has two opening and four closing
  # parentheses, and contains no period even though the comment preceding this
  # rule refers to one. Presumably part of the rule text was lost when this
  # copy was produced; TODO restore it from the upstream grammar.
  pattern: |
    (? /[A-Za-z]?\d+$/ (?! /[ACDEFGHIKLMNQRSTVWY]|mM/))))+ # note that (?! "mM") is to avoid capturing measurements
    [word=")"]?

###################
# ner rules
###################

# Species mentions read off the NER tags: one B-Species token followed by any
# number of I-Species tokens.
- name: ner-species
  label: Species
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-Species'] [entity='I-Species']*

# Cell-line mentions read off the NER tags: one B-CellLine token followed by
# any number of I-CellLine tokens.
- name: ner-cell-lines
  label: CellLine
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-CellLine'] [entity='I-CellLine']*

# Organ mentions read off the NER tags: one B-Organ token followed by any
# number of I-Organ tokens.
- name: ner-organ
  label: Organ
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-Organ'] [entity='I-Organ']*

# Cell-type mentions read off the NER tags: one B-CellType token followed by
# any number of I-CellType tokens.
- name: ner-cell-type
  label: CellType
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-CellType'] [entity='I-CellType']*

# Tissue-type mentions read off the NER tags: one B-TissueType token followed
# by any number of I-TissueType tokens.
- name: ner-tissue-type
  label: TissueType
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-TissueType'] [entity='I-TissueType']*

# - name: ner-infered-cell-type
#   label: [CellType]
#   priority: 4
#   type: token
#   pattern: |
#     @Organ [lemma=/^(cells?|tissue|fluids?)/]

# Gene / gene-product mentions read off the NER tags.
#   1. One or more B-Gene_or_gene_product tokens whose word does NOT match the
#      mutation-like regex (optional residue letter + digits, optional
#      del/ins/dup/fs, optional trailing residue letters).
#   2. Any number of I-Gene_or_gene_product tokens that are either all digits
#      or do not match that same regex.
#   3. A negative lookahead discards the match when the next token's lemma is
#      "substrate" or "family", when "protein" + "family" follows, or when an
#      "inhibitor" lemma follows that is not itself tagged
#      Gene_or_gene_product.
# NOTE(review): the leading residue classes omit P while the trailing class
# includes it — presumably so that, under (?i), names such as "p53" are not
# rejected as mutations; confirm before changing it.
- name: ner-gene_or_gene_product-entities
  label: Gene_or_gene_product
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-Gene_or_gene_product' & !word=/(?i)^([ACDEFGHIKLMNQRSTVWY]?\d+_|Delta)?[ACDEFGHIKLMNQRSTVWY]?\d+(del|ins|dup|fs)?[ACDEFGHIKLMNPQRSTVWY]*$/]+
    # digits are fine
    ([entity='I-Gene_or_gene_product' & word=/^\d+$/] |
    # avoid likely mutants, etc.
    [entity='I-Gene_or_gene_product' & !word=/(?i)^([ACDEFGHIKLMNQRSTVWY]?\d+_|Delta)?[ACDEFGHIKLMNQRSTVWY]?\d+(del|ins|dup|fs)?[ACDEFGHIKLMNPQRSTVWY]*$/])*
    # we may see the word "protein"
    # we can't always trust the crf.  Make sure it isn't an obvious family
    (?!
    [lemma=substrate | lemma=family]
    |
    [lemma=protein] [lemma=family]
    |
    [lemma=inhibitor & !entity=/Gene_or_gene_product/]
    )

# TODO: verify this ontology
# Family mentions read off the NER tags. Two alternatives:
#   - one or more B-Family tokens plus any I-Family tokens, unless the next
#     token has lemma "substrate" or is an "inhibitor" lemma tagged Family;
#   - a Gene_or_gene_product span (B- tokens, then I- tokens) that is followed
#     by an optional "protein" lemma and then a "family" lemma. This is a
#     lookahead, so those words are not part of the mention.
- name: ner-family-entities
  label: Family
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-Family']+ [entity='I-Family']* (?! [lemma=substrate] | [lemma=inhibitor & entity=/Family/])
    | # we can't always trust the crf.
    [entity='B-Gene_or_gene_product']+ [entity='I-Gene_or_gene_product']*
    # only allow the GGP ner label iff lemma of next tok is "family"
    (?=  [lemma=protein]? [lemma="family"])

# TODO: verify this ontology
# Cellular-component mentions read off the NER tags: one B-Cellular_component
# token followed by any number of I-Cellular_component tokens.
- name: ner-cellular_component-entities
  label: Cellular_component
  action: mkNERMentions
  priority: 3
  type: token
  pattern: |
    [entity='B-Cellular_component'] [entity='I-Cellular_component']*

# Simple-chemical mentions read off the NER tags: one B-Simple_chemical token
# followed by any number of I-Simple_chemical tokens, rejected when the next
# token's lemma is "substrate".
- name: ner-simple_chemical-entities
  label: Simple_chemical
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-Simple_chemical'] [entity='I-Simple_chemical']* (?! [lemma="substrate"])

# Site mentions read off the NER tags: one B-Site token followed by any number
# of I-Site tokens.
- name: ner-domain-entities
  label: Site
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-Site'] [entity='I-Site']*

# Biological-process mentions read off the NER tags: one B-BioProcess token
# followed by any number of I-BioProcess tokens.
- name: ner-bioprocess-entities
  label: BioProcess
  example: "apoptosis"
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-BioProcess'] [entity='I-BioProcess']*

# TODO: Ideally, this should somehow be added to the KB
# Hand-written Simple_chemical rule: a single token whose word matches
# /(?i)\-G[TD]P/ and also starts with a digit.
- name: missing-simple_chemical
  label: Simple_chemical
  type: token
  priority: 4
  action: mkBioMention
  pattern: |
    [word=/(?i)\-G[TD]P/ & word=/^\d/] # this also allows for 32P-GTP, etc.

# TODO: What labels should these guys have?
# Extends an existing BioChemicalEntity mention with a following token that
# matches "receptor" (case-insensitive); the combined span is labelled
# Gene_or_gene_product.
- name: missing-location
  label: Gene_or_gene_product
  type: token
  priority: 4
  action: mkBioMention
  pattern: |
    @BioChemicalEntity /(?i)receptor/ # there must be other relevant terms.  This rule comes from us finding "ERBB", but missing "ERBB receptors"

# "<protein> inhibitor" / "<family> inhibitor" phrases labelled as
# Simple_chemical. The head is either a Gene_or_gene_product span (B- tokens,
# then I- tokens, none matching the mutation-like regex) or a Family span
# (B-Family tokens, then I-Family tokens). It must be followed by a token with
# lemma "inhibitor" whose entity tag does not start with B or I; that token is
# part of the match.
- name: protein-inhibitor
  label: Simple_chemical
  type: token
  priority: 2
  action: mkBioMention
  pattern: |
    (
    [entity='B-Gene_or_gene_product' & !word=/(?i)^([ACDEFGHIKLMNQRSTVWY]?\d+_|Delta)?[ACDEFGHIKLMNQRSTVWY]?\d+(del|ins|dup|fs)?[ACDEFGHIKLMNPQRSTVWY]*$/]+
    [entity='I-Gene_or_gene_product' & !word=/(?i)^([ACDEFGHIKLMNQRSTVWY]?\d+_|Delta)?[ACDEFGHIKLMNQRSTVWY]?\d+(del|ins|dup|fs)?[ACDEFGHIKLMNPQRSTVWY]*$/]*
    |
    [entity='B-Family']+
    [entity='I-Family']*
    )
    [lemma=inhibitor & !entity=/^(B|I)/]




# © 2015 - 2025 Weber Informatics LLC | Privacy Policy