# Source: org.clulab.reach.biogrammar.entities.entities.yml
# (text extracted from a Maven / Gradle / Ivy artifact listing; the newest version)
###################
# Site rules
###################
# Site mention for a spelled-out amino acid: one token matching the full-name
# alternation below (case-insensitive, anchored at both ends), then three
# optional tokens, in order, matching /acid/, /(?i)residue/ and /\d{3}$/.
- name: site_long
  label: Site
  type: token
  priority: 1
  action: mkBioMention
  pattern: |
    [word = /(?i)^(alanine|arginine|asparagine|aspartic|aspartate|cysteine|glutamic|glutamate|glutamine|glycine|histidine|isoleucine|leucine|lysine|methionine|phenylalanine|proline|serine|threonine|tryptophan|tyrosine|valine)$/]
    [word = /acid/]?
    [word = /(?i)residue/]?
    [word = /\d{3}$/]?
# this one is ambiguous: there are proteins that have the same form; in such cases, prefer the proteins or chemicals
# NOTE(review): the pattern body of this rule looks truncated - its parenthesis and quote are
# unbalanced, which suggests that angle-bracketed text was lost when this file was extracted.
# It is left untouched here; restore it from the upstream grammar before loading this rule.
- name: site_1letter_a
label: Site
priority: 5
type: token
action: mkBioMention
pattern: |
(?")
# NB: the period is only there to handle errorful tokenization by processors at sentence endings.
# NOTE(review): the first pattern line of this rule has more closing than opening parentheses
# and no period is visible in it, so part of the pattern was presumably lost when this file was
# extracted. It is left untouched here; restore it from the upstream grammar before loading.
- name: site_1letter_c
label: Site
priority: 5
type: token
action: mkBioMention
pattern: |
(? /[A-Za-z]?\d+$/ (?! /[ACDEFGHIKLMNQRSTVWY]|mM/))))+ # note that (?! "mM") is to avoid capturing measurements
[word=")"]?
###################
# ner rules
###################
# Species mention: one token tagged B-Species followed by zero or more
# tokens tagged I-Species.
- name: ner-species
  label: Species
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-Species'] [entity='I-Species']*
# CellLine mention: one token tagged B-CellLine followed by zero or more
# tokens tagged I-CellLine.
- name: ner-cell-lines
  label: CellLine
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-CellLine'] [entity='I-CellLine']*
# Organ mention: one token tagged B-Organ followed by zero or more tokens
# tagged I-Organ.
- name: ner-organ
  label: Organ
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-Organ'] [entity='I-Organ']*
# CellType mention: one token tagged B-CellType followed by zero or more
# tokens tagged I-CellType.
- name: ner-cell-type
  label: CellType
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-CellType'] [entity='I-CellType']*
# TissueType mention: one token tagged B-TissueType followed by zero or more
# tokens tagged I-TissueType.
- name: ner-tissue-type
  label: TissueType
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-TissueType'] [entity='I-TissueType']*
# - name: ner-infered-cell-type
# label: [CellType]
# priority: 4
# type: token
# pattern: |
# @Organ [lemma=/^(cells?|tissue|fluids?)/]
# Gene_or_gene_product mention built from entity tags.
#   * Starts with one or more B-Gene_or_gene_product tokens whose word does
#     NOT match the mutant-like regex (optional residue letter, digits,
#     optional del/ins/dup/fs, optional trailing residue letters).
#   * Continues over I-Gene_or_gene_product tokens that are either purely
#     numeric or, again, not mutant-like.
#   * Is rejected when the next token(s) are "substrate", "family",
#     "protein family", or an "inhibitor" token whose entity tag does not
#     match /Gene_or_gene_product/.
# NOTE(review): the leading residue-letter classes omit "P" while the trailing
# class includes it - presumably so that names such as p53 are not treated as
# mutants; confirm before changing.
- name: ner-gene_or_gene_product-entities
  label: Gene_or_gene_product
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-Gene_or_gene_product' & !word=/(?i)^([ACDEFGHIKLMNQRSTVWY]?\d+_|Delta)?[ACDEFGHIKLMNQRSTVWY]?\d+(del|ins|dup|fs)?[ACDEFGHIKLMNPQRSTVWY]*$/]+
    # digits are fine
    ([entity='I-Gene_or_gene_product' & word=/^\d+$/] |
    # avoid likely mutants, etc.
    [entity='I-Gene_or_gene_product' & !word=/(?i)^([ACDEFGHIKLMNQRSTVWY]?\d+_|Delta)?[ACDEFGHIKLMNQRSTVWY]?\d+(del|ins|dup|fs)?[ACDEFGHIKLMNPQRSTVWY]*$/])*
    # we may see the word "protein"
    # we can't always trust the crf. Make sure it isn't an obvious family
    (?!
    [lemma=substrate | lemma=family]
    |
    [lemma=protein] [lemma=family]
    |
    [lemma=inhibitor & !entity=/Gene_or_gene_product/]
    )
# TODO: verify this ontology
# Family mention built from entity tags. Two alternatives:
#   1. B-Family token(s) plus any I-Family tokens, unless followed by
#      "substrate" or by an "inhibitor" token whose entity tag matches /Family/.
#   2. B-/I-Gene_or_gene_product tokens, but only when followed by
#      (optional "protein" then) "family".
# NOTE(review): alternative 1 blocks "inhibitor" tokens that ARE tagged
# Family, whereas ner-gene_or_gene_product-entities blocks "inhibitor" tokens
# that are NOT tagged Gene_or_gene_product - confirm this polarity is intended.
- name: ner-family-entities
  label: Family
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-Family']+ [entity='I-Family']* (?! [lemma=substrate] | [lemma=inhibitor & entity=/Family/])
    | # we can't always trust the crf.
    [entity='B-Gene_or_gene_product']+ [entity='I-Gene_or_gene_product']*
    # only allow the GGP ner label iff lemma of next tok is "family"
    (?= [lemma=protein]? [lemma="family"])
# TODO: verify this ontology
# Cellular_component mention: one token tagged B-Cellular_component followed
# by zero or more tokens tagged I-Cellular_component.
# The label is written as a plain scalar, like every other rule in this file
# (it was previously a one-element flow sequence).
- name: ner-cellular_component-entities
  label: Cellular_component
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-Cellular_component'] [entity='I-Cellular_component']*
# Simple_chemical mention: one token tagged B-Simple_chemical followed by
# zero or more tokens tagged I-Simple_chemical, rejected when the next token
# has the lemma "substrate".
- name: ner-simple_chemical-entities
  label: Simple_chemical
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-Simple_chemical'] [entity='I-Simple_chemical']* (?! [lemma="substrate"])
# Site mention from entity tags: one token tagged B-Site followed by zero or
# more tokens tagged I-Site.
- name: ner-domain-entities
  label: Site
  type: token
  priority: 3
  action: mkNERMentions
  pattern: |
    [entity='B-Site'] [entity='I-Site']*
# BioProcess mention: one token tagged B-BioProcess followed by zero or more
# tokens tagged I-BioProcess.
- name: ner-bioprocess-entities
  label: BioProcess
  type: token
  priority: 3
  action: mkNERMentions
  example: "apoptosis"
  pattern: |
    [entity='B-BioProcess'] [entity='I-BioProcess']*
# TODO: Ideally, this should somehow be added to the KB
# Simple_chemical mention for a single token whose word satisfies both
# /(?i)\-G[TD]P/ and /^\d/.
- name: missing-simple_chemical
  label: Simple_chemical
  type: token
  priority: 4
  action: mkBioMention
  pattern: |
    [word=/(?i)\-G[TD]P/ & word=/^\d/] # this also allows for 32P-GTP, etc.
# TODO: What labels should these guys have?
# Extends a @BioChemicalEntity reference with a following token matching
# /(?i)receptor/ and labels the span Gene_or_gene_product.
# NOTE(review): the rule name says "location" but the label is
# Gene_or_gene_product - confirm which one is intended.
- name: missing-location
  label: Gene_or_gene_product
  type: token
  priority: 4
  action: mkBioMention
  pattern: |
    @BioChemicalEntity /(?i)receptor/ # there must be other relevant terms. This rule comes from us finding "ERBB", but missing "ERBB receptors"
# Simple_chemical mention for "<protein or family> inhibitor":
#   * either B-/I-Gene_or_gene_product tokens whose words do not match the
#     mutant-like regex, or B-Family token(s) plus any I-Family tokens,
#   * followed by a token with the lemma "inhibitor" whose entity tag does not
#     match /^(B|I)/.
- name: protein-inhibitor
  label: Simple_chemical
  type: token
  priority: 2
  action: mkBioMention
  pattern: |
    (
    [entity='B-Gene_or_gene_product' & !word=/(?i)^([ACDEFGHIKLMNQRSTVWY]?\d+_|Delta)?[ACDEFGHIKLMNQRSTVWY]?\d+(del|ins|dup|fs)?[ACDEFGHIKLMNPQRSTVWY]*$/]+
    [entity='I-Gene_or_gene_product' & !word=/(?i)^([ACDEFGHIKLMNQRSTVWY]?\d+_|Delta)?[ACDEFGHIKLMNQRSTVWY]?\d+(del|ins|dup|fs)?[ACDEFGHIKLMNPQRSTVWY]*$/]*
    |
    [entity='B-Family']+
    [entity='I-Family']*
    )
    [lemma=inhibitor & !entity=/^(B|I)/]
# (hosting-page footer) © 2015 - 2025 Weber Informatics LLC | Privacy Policy