# All Downloads are FREE. Search and download functionalities are using the official Maven repository.
#
# wittertext.twitter-text.2.0.8.source-code.extract.yml Maven / Gradle / Ivy
#
# There is a newer version: 3.1.0
# Show newest version
tests:
  mentions:
    - description: "Extract mention at the beginning of a tweet"
      text: "@username reply"
      expected: ["username"]

    - description: "Extract mention at the end of a tweet"
      text: "mention @username"
      expected: ["username"]

    - description: "Extract mention in the middle of a tweet"
      text: "mention @username in the middle"
      expected: ["username"]

    - description: "Extract mention of username with underscore"
      text: "mention @user_name"
      expected: ["user_name"]

    - description: "Extract mention of all numeric username"
      text: "mention @12345"
      expected: ["12345"]

    - description: "Extract mention of multiple usernames"
      text: "mention @username1 @username2"
      expected: ["username1", "username2"]

    - description: "Extract mention in the middle of a Japanese tweet"
      text: "の@usernameに到着を待っている"
      expected: ["username"]

    - description: "DO NOT extract username ending in @"
      text: "Current Status: @_@ (cc: @username)"
      expected: ["username"]

    - description: "DO NOT extract username followed by accented latin characters"
      text: "@aliceìnheiro something something"
      expected: []

    - description: "Extract lone mention but not @user@user (too close to an email)"
      text: "@username email me @test@example.com"
      expected: ["username"]

    - description: "DO NOT extract 'http' in '@http://' as username"
      text: "@http://twitter.com"
      expected: []

    - description: "Extract mentions before newline"
      text: "@username\n@mention"
      expected: ["username", "mention"]

    - description: "Extract mentions after 'RT'"
      text: "RT@username RT:@mention RT @test"
      expected: ["username", "mention", "test"]

    - description: "Extract mentions after 'rt'"
      text: "rt@username rt:@mention rt @test"
      expected: ["username", "mention", "test"]

    - description: "Extract mentions after 'Rt'"
      text: "Rt@username Rt:@mention Rt @test"
      expected: ["username", "mention", "test"]

    - description: "Extract mentions after 'rT'"
      text: "rT@username rT:@mention rT @test"
      expected: ["username", "mention", "test"]

    - description: "DO NOT extract username preceded by !"
      text: "f!@kn"
      expected: []

    - description: "DO NOT extract username preceded by @"
      text: "f@@kn"
      expected: []

    - description: "DO NOT extract username preceded by #"
      text: "f#@kn"
      expected: []

    - description: "DO NOT extract username preceded by $"
      text: "f$@kn"
      expected: []

    - description: "DO NOT extract username preceded by %"
      text: "f%@kn"
      expected: []

    - description: "DO NOT extract username preceded by &"
      text: "f&@kn"
      expected: []

    - description: "DO NOT extract username preceded by *"
      text: "f*@kn"
      expected: []

  mentions_with_indices:
    - description: "Extract a mention at the start"
      text: "@username yo!"
      expected:
        - screen_name: "username"
          indices: [0, 9]

    - description: "Extract a mention that has the same thing mentioned at the start"
      text: "username @username"
      expected:
        - screen_name: "username"
          indices: [9, 18]

    - description: "Extract a mention in the middle of a Japanese tweet"
      text: "の@usernameに到着を待っている"
      expected:
        - screen_name: "username"
          indices: [1, 10]

  mentions_or_lists_with_indices:
    - description: "Extract a mention"
      text: "@username yo!"
      expected:
        - screen_name: "username"
          list_slug: ""
          indices: [0, 9]

    - description: "Extract a list"
      text: "@username/list-name is a great list!"
      expected:
        - screen_name: "username"
          list_slug: "/list-name"
          indices: [0, 19]

    - description: "Extract a mention and list"
      text: "Hey @username, check out out @otheruser/list_name-01!"
      expected:
        - screen_name: "username"
          list_slug: ""
          indices: [4, 13]
        - screen_name: "otheruser"
          list_slug: "/list_name-01"
          indices: [29, 52]

    - description: "Extract a list in the middle of a Japanese tweet"
      text: "の@username/list_name-01に到着を待っている"
      expected:
        - screen_name: "username"
          list_slug: "/list_name-01"
          indices: [1, 23]

    - description: "DO NOT extract a list with slug that starts with a number"
      text: "@username/7list-name is a great list!"
      expected:
        - screen_name: "username"
          list_slug: ""
          indices: [0, 9]

  replies:
    - description: "Extract reply at the beginning of a tweet"
      text: "@username reply"
      expected: "username"

    - description: "Extract reply preceded by only a space"
      text: " @username reply"
      expected: "username"

    - description: "Extract reply preceded by only a full-width space (U+3000)"
      text: " @username reply"
      expected: "username"

    - description: "DO NOT Extract reply when preceded by text"
      text: "a @username mention, not a reply"
      expected:

    - description: "DO NOT Extract reply when preceded by ."
      text: ".@username mention, not a reply"
      expected:

    - description: "DO NOT Extract reply when preceded by /"
      text: "/@username mention, not a reply"
      expected:

    - description: "DO NOT Extract reply when preceded by _"
      text: "_@username mention, not a reply"
      expected:

    - description: "DO NOT Extract reply when preceded by -"
      text: "-@username mention, not a reply"
      expected:

    - description: "DO NOT Extract reply when preceded by +"
      text: "+@username mention, not a reply"
      expected:

    - description: "DO NOT Extract reply when preceded by #"
      text: "#@username mention, not a reply"
      expected:

    - description: "DO NOT Extract reply when preceded by !"
      text: "!@username mention, not a reply"
      expected:

    - description: "DO NOT Extract reply when preceded by @"
      text: "@@username mention, not a reply"
      expected:

    - description: "DO NOT Extract reply when followed by URL"
      text: "@http://twitter.com"
      expected:

  urls:
    - description: "Extract a lone URL"
      text: "http://example.com"
      expected: ["http://example.com"]

    - description: "Extract a lone unicode url"
      text: "http://ああ.com"
      expected: ["http://ああ.com"]

    - description: "Extract a lone unicode url with -"
      text: "http://あ-あ.com"
      expected: ["http://あ-あ.com"]

    - description: "Extract valid URL: http://google.com"
      text: "text http://google.com"
      expected: ["http://google.com"]

    - description: "Extract valid URL: http://foobar.com/#"
      text: "text http://foobar.com/#"
      expected: ["http://foobar.com/#"]

    - description: "Extract valid URL: http://google.com/#foo"
      text: "text http://google.com/#foo"
      expected: ["http://google.com/#foo"]

    - description: "Extract valid URL: http://google.com/#search?q=iphone%20-filter%3Alinks"
      text: "text http://google.com/#search?q=iphone%20-filter%3Alinks"
      expected: ["http://google.com/#search?q=iphone%20-filter%3Alinks"]

    - description: "Extract valid URL: http://twitter.com/#search?q=iphone%20-filter%3Alinks"
      text: "text http://twitter.com/#search?q=iphone%20-filter%3Alinks"
      expected: ["http://twitter.com/#search?q=iphone%20-filter%3Alinks"]

    - description: "Extract valid URL: http://somedomain.com/index.php?path=/abc/def/"
      text: "text http://somedomain.com/index.php?path=/abc/def/"
      expected: ["http://somedomain.com/index.php?path=/abc/def/"]

    - description: "Extract valid URL: http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"
      text: "text http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"
      expected: ["http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"]

    - description: "Extract valid URL: http://somehost.com:3000"
      text: "text http://somehost.com:3000"
      expected: ["http://somehost.com:3000"]

    - description: "Extract valid URL: http://xo.com/~matthew+%ff-x"
      text: "text http://xo.com/~matthew+%ff-x"
      expected: ["http://xo.com/~matthew+%ff-x"]

    - description: "Extract valid URL: http://xo.com/~matthew+%ff-,.;x"
      text: "text http://xo.com/~matthew+%ff-,.;x"
      expected: ["http://xo.com/~matthew+%ff-,.;x"]

    - description: "Extract valid URL: http://xo.com/,.;x"
      text: "text http://xo.com/,.;x"
      expected: ["http://xo.com/,.;x"]

    - description: "Extract valid URL: http://en.wikipedia.org/wiki/Primer_(film)"
      text: "text http://en.wikipedia.org/wiki/Primer_(film)"
      expected: ["http://en.wikipedia.org/wiki/Primer_(film)"]

    - description: "Extract valid URL: http://www.ams.org/bookstore-getitem/item=mbk-59"
      text: "text http://www.ams.org/bookstore-getitem/item=mbk-59"
      expected: ["http://www.ams.org/bookstore-getitem/item=mbk-59"]

    - description: "Extract valid URL: http://✪df.ws/ejp"
      text: "text http://✪df.ws/ejp"
      expected: ["http://✪df.ws/ejp"]

    - description: "Extract valid URL: http://example.com/"
      text: "test http://example.comだよね.comtest/hogehoge"
      expected: ["http://example.com"]

    - description: "Extract valid URL: http://chilp.it/?77e8fd"
      text: "text http://chilp.it/?77e8fd"
      expected: ["http://chilp.it/?77e8fd"]

    - description: "Extract valid URL: http://x.com/oneletterdomain"
      text: "text http://x.com/oneletterdomain"
      expected: ["http://x.com/oneletterdomain"]

    - description: "Extract valid URL: http://msdn.microsoft.com/ja-jp/library/system.net.httpwebrequest(v=VS.100).aspx"
      text: "text http://msdn.microsoft.com/ja-jp/library/system.net.httpwebrequest(v=VS.100).aspx"
      expected: ["http://msdn.microsoft.com/ja-jp/library/system.net.httpwebrequest(v=VS.100).aspx"]

    - description: "Extract valid URL with hyphen as query ending char: https://www.youtube.com/watch?v=LOxOAuDHzaw&list=PLPoq910Q9jXhuH6pit_KwIsck9fEz_9U-"
      text: "text https://www.youtube.com/watch?v=LOxOAuDHzaw&list=PLPoq910Q9jXhuH6pit_KwIsck9fEz_9U-"
      expected: ["https://www.youtube.com/watch?v=LOxOAuDHzaw&list=PLPoq910Q9jXhuH6pit_KwIsck9fEz_9U-"]

    - description: "DO NOT extract invalid URL: http://no-tld"
      text: "text http://no-tld"
      expected: []

    - description: "DO NOT extract invalid URL: http://tld-too-short.x"
      text: "text http://tld-too-short.x"
      expected: []

    - description: "Extract URL preceded by an opening parenthesis: (http://twitter.com"
      text: "(http://twitter.com"
      expected: ["http://twitter.com"]

    - description: "Extract a very long hyphenated sub-domain URL (single letter hyphens)"
      text: "text http://word-and-a-number-8-ftw.domain.com/"
      expected: ["http://word-and-a-number-8-ftw.domain.com/"]

    - description: "Extract a hyphenated TLD (usually a typo)"
      text: "text http://domain.com-that-you-should-have-put-a-space-after"
      expected: ["http://domain.com"]

    - description: "Extract URL ending with # value"
      text: "text http://foo.com?#foo text"
      expected: ["http://foo.com?#foo"]

    - description: "Extract URLs without protocol on (com|org|edu|gov|net) domains"
      text: "foo.com foo.net foo.org foo.edu foo.gov"
      expected: ["foo.com", "foo.net", "foo.org", "foo.edu", "foo.gov"]

    - description: "Extract URLs without protocol not on (com|org|edu|gov|net) domains"
      text: "foo.baz foo.co.jp www.xxxxxxx.baz www.foo.co.uk wwwww.xxxxxxx foo.comm foo.somecom foo.govedu foo.jp"
      expected: ["foo.co.jp", "www.foo.co.uk"]

    - description: "Extract URLs without protocol on ccTLD with slash"
      text: "t.co/abcde bit.ly/abcde"
      expected: ["t.co/abcde", "bit.ly/abcde"]

    - description: "Extract URLs with protocol on ccTLD domains"
      text: "http://foo.jp http://fooooo.jp"
      expected: ["http://foo.jp", "http://fooooo.jp"]

    - description: "Extract URLs with a - or + at the end of the path"
      text: "Go to http://example.com/a+ or http://example.com/a-"
      expected: ["http://example.com/a+", "http://example.com/a-"]

    - description: "Extract URLs with longer paths ending in -"
      text: "Go to http://example.com/view/slug-url-?foo=bar"
      expected: ["http://example.com/view/slug-url-?foo=bar"]

    - description: "Extract URLs with an en dash in the path"
      text: "Go to https://en.m.wikipedia.org/wiki/Hatfield–McCoy_feud please"
      expected: ["https://en.m.wikipedia.org/wiki/Hatfield–McCoy_feud"]

    - description: "Extract URLs beginning with a space"
      text: "@user Try http:// example.com/path"
      expected: ["example.com/path"]

    - description: "Extract long URL without protocol surrounded by CJK characters"
      text: "これは日本語です。example.com/path/index.html中国語example.com/path한국"
      expected: ["example.com/path/index.html", "example.com/path"]

    - description: "Extract short URL without protocol surrounded by CJK characters"
      text: "twitter.comこれは日本語です。example.com中国語t.co/abcde한국twitter.com example2.comテストtwitter.com/abcde"
      expected: ["twitter.com", "example.com", "t.co/abcde", "twitter.com", "example2.com", "twitter.com/abcde"]

    - description: "Extract URLs with and without protocol surrounded by CJK characters"
      text: "http://twitter.com/これは日本語です。example.com中国語http://t.co/abcde한국twitter.comテストexample2.comテストhttp://twitter.com/abcde"
      expected: ["http://twitter.com/", "example.com", "http://t.co/abcde", "twitter.com", "example2.com", "http://twitter.com/abcde"]

    - description: "Extract URLs with protocol and path containing Cyrillic characters"
      text: "Go to http://twitter.com/Русские_слова"
      expected: ["http://twitter.com/Русские_слова"]

    - description: "Extract non-ASCII host name URLs with protocol, but ignore host names bigger than 63 characters. Also handle exceptions for non-ASCII hostnames longer than 256 characters"
      text: "http://exampleこれは日本語です.com/path/index.html http://あああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああ.com/path/index.html"
      expected: ["http://exampleこれは日本語です.com/path/index.html"]

    - description: "DO NOT extract short URLs without protocol on ccTLD domains without path"
      text: "twitter.jp日本語it.so中国語foo.jp it.so foo.jp"
      expected: []

    - description: "DO NOT extract invalid URL"
      text: "Hello http://xn--はじめよう.com/index.html"
      expected: []

    - description: "DO NOT Extract URL with domain preceded by underscore: http://domain-begin_dash_2314352345_dfasd.foo-cow_4352.com"
      text: "text http://domain-dash_2314352345_dfasd.foo-cow_4352.com"
      expected: []

    - description: "Extract some (tv|co) short URLs without protocol on ccTLD domains without path"
      text: "MLB.tv vine.co twitch.tv t.co"
      expected: ["MLB.tv", "vine.co", "twitch.tv", "t.co"]

    - description: "Extract URLs beginning with a non-breaking space (U+00A0)"
      text: "@user Try http:// example.com/path"
      expected: ["example.com/path"]

    - description: "Extract URLs with underscores and dashes in the subdomain"
      text: "test http://sub_domain-dash.twitter.com"
      expected: ["http://sub_domain-dash.twitter.com"]

    - description: "Extract URL with minimum number of valid characters"
      text: "test http://a.b.cd"
      expected: ["http://a.b.cd"]

    - description: "Extract URLs containing underscores and dashes"
      text: "test http://a_b.c-d.com"
      expected: ["http://a_b.c-d.com"]

    - description: "Extract URLs containing dashes in the subdomain"
      text: "test http://a-b.c.com"
      expected: ["http://a-b.c.com"]

    - description: "Extract URLs with dashes in the domain name"
      text: "test http://twitter-dash.com"
      expected: ["http://twitter-dash.com"]

    - description: "Extract URLs with lots of symbols then a period"
      text: "http://www.bestbuy.com/site/Currie+Technologies+-+Ezip+400+Scooter/9885188.p?id=1218189013070&skuId=9885188"
      expected: ["http://www.bestbuy.com/site/Currie+Technologies+-+Ezip+400+Scooter/9885188.p?id=1218189013070&skuId=9885188"]

    - description: "DO NOT extract URLs containing leading dashes in the subdomain"
      text: "test http://-leadingdash.twitter.com"
      expected: []

    - description: "DO NOT extract URLs containing leading dashes in the domain with a subdomain"
      text: "test http://leadingdash.-twitter.com"
      expected: []

    - description: "DO NOT extract URLs containing trailing dashes in the subdomain"
      text: "test http://trailingdash-.twitter.com"
      expected: []

    - description: "DO NOT extract URLs containing trailing dashes in the domain with a subdomain"
      text: "test http://trailingdash.twitter-.com"
      expected: []

    - description: "DO NOT extract URLs containing leading underscores in the subdomain"
      text: "test http://_leadingunderscore.twitter.com"
      expected: []

    - description: "DO NOT extract URLs containing leading underscores in the domain with a subdomain"
      text: "test http://leadingunderscore._twitter.com"
      expected: []

    - description: "DO NOT extract URLs containing trailing underscores in the subdomain"
      text: "test http://trailingunderscore_.twitter.com"
      expected: []

    - description: "DO NOT extract URLs containing trailing underscores in the domain with a subdomain"
      text: "test http://trailingunderscore.twitter_.com"
      expected: []

    - description: "DO NOT extract URLs containing leading dashes in the domain name"
      text: "test http://-twitter.com"
      expected: []

    - description: "DO NOT extract URLs containing trailing dashes in the domain name"
      text: "test http://twitter-.com"
      expected: []

    - description: "DO NOT extract URLs containing underscores in the domain name"
      text: "test http://twitter_underscore.com"
      expected: []

    - description: "DO NOT extract URLs containing underscores in the tld"
      text: "test http://twitter.c_o_m"
      expected: []

    - description: "Extract valid URL http://www.foo.com/foo/path-with-period./"
      text: "test http://www.foo.com/foo/path-with-period./"
      expected: ["http://www.foo.com/foo/path-with-period./"]

    - description: "Extract valid URL http://www.foo.org.za/foo/bar/688.1"
      text: "test http://www.foo.org.za/foo/bar/688.1"
      expected: ["http://www.foo.org.za/foo/bar/688.1"]

    - description: "Extract valid URL http://www.foo.com/bar-path/some.stm?param1=foo;param2=P1|0||P2|0"
      text: "test http://www.foo.com/bar-path/some.stm?param1=foo;param2=P1|0||P2|0"
      expected: ["http://www.foo.com/bar-path/some.stm?param1=foo;param2=P1|0||P2|0"]

    - description: "Extract valid URL http://foo.com/bar/123/foo_&_bar/"
      text: "test http://foo.com/bar/123/foo_&_bar/"
      expected: ["http://foo.com/bar/123/foo_&_bar/"]

    - description: "Extract valid URL http://www.cp.sc.edu/events/65"
      text: "test http://www.cp.sc.edu/events/65 test"
      expected: ["http://www.cp.sc.edu/events/65"]

    - description: "Extract valid URL http://www.andersondaradio.no.comunidades.net/"
      text: "http://www.andersondaradio.no.comunidades.net/ test test"
      expected: ["http://www.andersondaradio.no.comunidades.net/"]

    - description: "Extract valid URL ELPAÍS.com"
      text: "test ELPAÍS.com"
      expected: ["ELPAÍS.com"]

    - description: "DO NOT include period at the end of URL"
      text: "test http://twitter.com/."
      expected: ["http://twitter.com/"]

    - description: "Extract a URL with '?' in fragment"
      text: "http://tn.com.ar/show/00056158/la-estrella-del-certamen-el-turno-de-pamela-anderson?fb_xd_fragment#?=&cb=fe17523f223b7&relation=parent.parent&transport=fragment&type=resize&height=20&ackdata"
      expected: ["http://tn.com.ar/show/00056158/la-estrella-del-certamen-el-turno-de-pamela-anderson?fb_xd_fragment#?=&cb=fe17523f223b7&relation=parent.parent&transport=fragment&type=resize&height=20&ackdata"]

    - description: "Extract a URL with '?' in fragment in a text"
      text: "text http://tn.com.ar/show/00056158/la-estrella-del-certamen-el-turno-de-pamela-anderson?fb_xd_fragment#?=&cb=fe17523f223b7&relation=parent.parent&transport=fragment&type=resize&height=20&ackdata text"
      expected: ["http://tn.com.ar/show/00056158/la-estrella-del-certamen-el-turno-de-pamela-anderson?fb_xd_fragment#?=&cb=fe17523f223b7&relation=parent.parent&transport=fragment&type=resize&height=20&ackdata"]

    # Long runs of trailing punctuation are a common cause of runaway regex engines.
    - description: "Extract a URL with a ton of trailing periods"
      text: "Test a ton of periods http://example.com/path.........................................."
      expected: ["http://example.com/path"]

    - description: "Extract a URL with a ton of trailing commas"
      text: "Test a ton of periods http://example.com/,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"
      expected: ["http://example.com/"]

    - description: "Extract a URL with a ton of trailing '!'"
      text: "Test a ton of periods http://example.com/path/!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
      expected: ["http://example.com/path/"]

    - description: "DO NOT extract URLs in hashtag or @mention"
      text: "#test.com @test.com #http://test.com @http://test.com #t.co/abcde @t.co/abcde"
      expected: []

    - description: "Extract a t.co URL with a trailing apostrophe"
      text: "I really like http://t.co/pbY2NfTZ's website"
      expected: ["http://t.co/pbY2NfTZ"]

    - description: "Extract a t.co URL with a trailing hyphen"
      text: "Check this site out http://t.co/FNkPfmii- it's great"
      expected: ["http://t.co/FNkPfmii"]

    - description: "Extract a t.co URL with a trailing colon"
      text: "According to http://t.co/ulYGBYSo: the internet is cool"
      expected: ["http://t.co/ulYGBYSo"]

    - description: "Extract a t.co URL with a long path"
      text: "I really like http://t.co/abcdefghijklmnopqrstuvwxyz0123456789"
      expected: ["http://t.co/abcdefghijklmnopqrstuvwxyz0123456789"]

    - description: "DO NOT extract URLs with > 40 characters in a t.co slug"
      text: "I really like http://t.co/abcdefghijklmnopqrstuvwxyz012345678901234"
      expected: []

    - description: "Extract domain followed by Japanese characters"
      text: "example.comてすとですtwitter.みんなです"
      expected: ["example.com", "twitter.みんな"]

    - description: "Extract URL before newline"
      text: "http://twitter.com\nhttp://example.com\nhttp://example.com/path\nexample.com/path\nit.so\nit.so/abcde"
      expected: ["http://twitter.com", "http://example.com", "http://example.com/path", "example.com/path", "it.so/abcde"]

    - description: "DO NOT extract URL if preceded by $"
      text: "$http://twitter.com $twitter.com $http://t.co/abcde $t.co/abcde $t.co $TVI.CA $RBS.CA"
      expected: []

    - description: "DO NOT extract .bz2 file name as URL"
      text: "long.test.tar.bz2 test.tar.bz2 tar.bz2"
      expected: []

    - description: "DO NOT extract URL with gTLD followed by @ sign"
      text: "john.doe.gov@mail.com"
      expected: []

    - description: "DO NOT extract URL with ccTLD followed by @ sign"
      text: "john.doe.jp@mail.com"
      expected: []

  urls_with_indices:
    - description: "Extract a URL"
      text: "text http://google.com"
      expected:
        - url: "http://google.com"
          indices: [5, 22]

    - description: "Extract a URL from a Japanese tweet"
      text: "皆さん見てください! http://google.com"
      expected:
        - url: "http://google.com"
          indices: [11, 28]

    - description: "Extract URLs without protocol on ccTLD with slash"
      text: "t.co/abcde bit.ly/abcde"
      expected:
        - url: "t.co/abcde"
          indices: [0, 10]
        - url: "bit.ly/abcde"
          indices: [11, 23]

    - description: "Extract URLs without protocol surrounded by CJK characters"
      text: "twitter.comこれは日本語です。example.com中国語t.co/abcde한국twitter.com example2.comテストtwitter.com/abcde"
      expected:
        - url: "twitter.com"
          indices: [0, 11]
        - url: "example.com"
          indices: [20, 31]
        - url: "t.co/abcde"
          indices: [34, 44]
        - url: "twitter.com"
          indices: [46, 57]
        - url: "example2.com"
          indices: [58, 70]
        - url: "twitter.com/abcde"
          indices: [73, 90]

    - description: "Extract URLs with and without protocol surrounded by CJK characters"
      text: "http://twitter.com/これは日本語です。example.com中国語http://t.co/abcde한국twitter.comテストexample2.comテストhttp://twitter.com/abcde"
      expected:
        - url: "http://twitter.com/"
          indices: [0, 19]
        - url: "example.com"
          indices: [28, 39]
        - url: "http://t.co/abcde"
          indices: [42, 59]
        - url: "twitter.com"
          indices: [61, 72]
        - url: "example2.com"
          indices: [75, 87]
        - url: "http://twitter.com/abcde"
          indices: [90, 114]

    - description: "Extract t.co URLs skipping trailing characters and adjusting indices correctly"
      text: "http://t.co/pbY2NfTZ's http://t.co/2vYHpAc5; http://t.co/ulYGBYSo: http://t.co/8MkmHU0k+c http://t.co/TKLp64dY.x http://t.co/8t7G3ddS#a http://t.co/FNkPfmii-"
      expected:
        - url: "http://t.co/pbY2NfTZ"
          indices: [0, 20]
        - url: "http://t.co/2vYHpAc5"
          indices: [23, 43]
        - url: "http://t.co/ulYGBYSo"
          indices: [45, 65]
        - url: "http://t.co/8MkmHU0k"
          indices: [67, 87]
        - url: "http://t.co/TKLp64dY"
          indices: [90, 110]
        - url: "http://t.co/8t7G3ddS"
          indices: [113, 133]
        - url: "http://t.co/FNkPfmii"
          indices: [136, 156]

    - description: "Properly extract URL that contains t.co in referer"
      text: "http://www.foo.com?referer=https://t.co/abcde http://t.co/xyzzy"
      expected:
        - url: "http://www.foo.com?referer=https://t.co/abcde"
          indices: [0, 45]
        - url: "http://t.co/xyzzy"
          indices: [46, 63]

    - description: "Extract correct indices for duplicate instances of the same URL"
      text: "http://t.co http://t.co"
      expected:
        - url: "http://t.co"
          indices: [0, 11]
        - url: "http://t.co"
          indices: [12, 23]

    - description: "Extract I18N URL"
      text: "test http://xn--ls8h.XN--ls8h.la/"
      expected:
        - url: "http://xn--ls8h.XN--ls8h.la/"
          indices: [5, 33]

    - description: "Extract URLs with IDN(not encoded)"
      text: "test http://foobar.みんな/ http://foobar.中国/ http://foobar.پاکستان/ "
      expected:
        - url: "http://foobar.みんな/"
          indices: [5, 23]
        - url: "http://foobar.中国/"
          indices: [24, 41]
        - url: "http://foobar.پاکستان/"
          indices: [42, 64]

  hashtags:
    - description: "Extract hashtag after emoji without variant selector (uFE0E or uFE0F)"
      text: "a ✌#hashtag here"
      expected: ["hashtag"]

    - description: "Extract hashtag after emoji with variant selector FE0E"
      text: "a ✌︎#hashtag here"
      expected: ["hashtag"]

    - description: "Extract hashtag after emoji with variant selector FE0F"
      text: "a ✌️#hashtag here"
      expected: ["hashtag"]

    - description: "Extract hashtag after emoji with skin tone without variant selector (FE0E or FE0F)"
      text: "a ✌🏿#hashtag here"
      expected: ["hashtag"]

    - description: "Extract hashtag after emoji with skin tone with variant selector FE0F"
      text: "a ✌🏿️#hashtag here"
      expected: ["hashtag"]

    - description: "Extract hashtag after emoji with zero-width-joiner"
      text: "a 👨‍👩‍👧#hashtag here"
      expected: ["hashtag"]

    - description: "Extract an all-alpha hashtag"
      text: "a #hashtag here"
      expected: ["hashtag"]

    - description: "Extract a letter-then-number hashtag"
      text: "this is #hashtag1"
      expected: ["hashtag1"]

    - description: "Extract a number-then-letter hashtag"
      text: "#1hashtag is this"
      expected: ["1hashtag"]

    - description: "DO NOT Extract an all-numeric hashtag"
      text: "On the #16 bus"
      expected: []

    - description: "DO NOT Extract a single numeric hashtag"
      text: "#0"
      expected: []

    - description: "Extract hashtag after bracket"
      text: "(#hashtag1 )#hashtag2 [#hashtag3 ]#hashtag4 ’#hashtag5’#hashtag6"
      expected: ["hashtag1", "hashtag2", "hashtag3", "hashtag4", "hashtag5", "hashtag6"]

    - description: "Extract a hashtag containing ñ"
      text: "I'll write more tests #mañana"
      expected: ["mañana"]

    - description: "Extract a hashtag containing é"
      text: "Working remotely #café"
      expected: ["café"]

    - description: "Extract a hashtag containing ü"
      text: "Getting my Oktoberfest on #münchen"
      expected: ["münchen"]

    - description: "DO NOT Extract a hashtag containing Japanese"
      text: "this is not valid: # 会議中 ハッシュ"
      expected: []

    - description: "Extract a hashtag in Korean"
      text: "What is #트위터 anyway?"
      expected: ["트위터"]

    - description: "Extract a half-width Hangul hashtag"
      text: "Just random half-width Hangul #ᆪᆭᄚ"
      expected: ["ᆪᆭᄚ"]

    - description: "Extract a hashtag in Russian"
      text: "What is #ашок anyway?"
      expected: ["ашок"]

    - description: "Extract a starting katakana hashtag"
      text: "#カタカナ is a hashtag"
      expected: ["カタカナ"]

    - description: "Extract a starting hiragana hashtag"
      text: "#ひらがな FTW!"
      expected: ["ひらがな"]

    - description: "Extract a starting kanji hashtag"
      text: "#漢字 is the future"
      expected: ["漢字"]

    - description: "Extract a trailing katakana hashtag"
      text: "Hashtag #カタカナ"
      expected: ["カタカナ"]

    - description: "Extract a trailing hiragana hashtag"
      text: "Japanese hashtags #ひらがな"
      expected: ["ひらがな"]

    - description: "Extract a trailing kanji hashtag"
      text: "Study time #漢字"
      expected: ["漢字"]

    - description: "Extract a central katakana hashtag"
      text: "See my #カタカナ hashtag?"
      expected: ["カタカナ"]

    - description: "Extract a central hiragana hashtag"
      text: "Study #ひらがな for fun and profit"
      expected: ["ひらがな"]

    - description: "Extract a central kanji hashtag"
      text: "Some say #漢字 is the past. what do they know?"
      expected: ["漢字"]

    - description: "Extract a Kanji/Katakana mixed hashtag"
      text: "日本語ハッシュタグテスト #日本語ハッシュタグ"
      expected: ["日本語ハッシュタグ"]

    - description: "Extract a hashtag after a punctuation"
      text: "日本語ハッシュテスト。#日本語ハッシュタグ"
      expected: ["日本語ハッシュタグ"]

    - description: "DO NOT include a punctuation in a hashtag"
      text: "#日本語ハッシュタグ。"
      expected: ["日本語ハッシュタグ"]

    - description: "Extract a full-width Alnum hashtag"
      text: "全角英数字ハッシュタグ #hashtag123"
      expected: ["hashtag123"]

    - description: "DO NOT extract a hashtag without a preceding space"
      text: "日本語ハッシュタグ#日本語ハッシュタグ"
      expected: []

    - description: "Hashtag with chouon"
      text: "長音ハッシュタグ。#サッカー"
      expected: ["サッカー"]

    - description: "Hashtag with half-width chouon"
      text: "長音ハッシュタグ。#サッカー"
      expected: ["サッカー"]

    - description: "Hashtag with half-width voiced sound marks"
      text: "#ハッシュタグ #パピプペポ"
      expected: ["ハッシュタグ", "パピプペポ"]

    - description: "Hashtag with half-width # after full-width !"
      text: "できましたよー!#日本語ハッシュタグ。"
      expected: ["日本語ハッシュタグ"]

    - description: "Hashtag with full-width # after full-width !"
      text: "できましたよー!#日本語ハッシュタグ。"
      expected: ["日本語ハッシュタグ"]

    # Iteration marks (々 ゝ ゞ 〻 〃) are accepted inside a tag when they
    # follow a letter; each of the five tags below is extracted whole.
    - description: "Hashtag with ideographic iteration mark"
      text: "#云々 #学問のすゝめ #いすゞ #各〻 #各〃"
      expected: ["云々", "学問のすゝめ", "いすゞ", "各〻", "各〃"]

    # Two visually similar characters are tested separately: the fullwidth
    # tilde "～" (U+FF5E) and the wave dash "〜" (U+301C). Both must be kept
    # inside the tag. The first case must contain the real fullwidth tilde;
    # an ASCII "~" (what NFKC normalization produces) does not match the
    # description and would no longer exercise U+FF5E.
    - description: "Extract hashtag with fullwidth tilde"
      text: "#メ～テレ ハッシュタグ内で～が認識されず"
      expected: ["メ～テレ"]

    - description: "Extract hashtag with wave dash"
      text: "#メ〜テレ ハッシュタグ内で~が認識されず"
      expected: ["メ〜テレ"]

    # Turkish-specific letters are valid hashtag characters, including a tag
    # that consists of a single such letter ("#ş").
    - description: "Hashtags with ş (U+015F)"
      text: "Here’s a test tweet for you: #Ateş #qrşt #ştu #ş"
      expected: ["Ateş", "qrşt", "ştu", "ş"]

    - description: "Hashtags with İ (U+0130) and ı (U+0131)"
      text: "Here’s a test tweet for you: #İn #ın"
      expected: ["İn", "ın"]

    # ASCII punctuation ends a tag (it is never included in the result) and
    # may also directly precede the "#".
    - description: "Hashtag before punctuations"
      text: "#hashtag: #hashtag; #hashtag, #hashtag. #hashtag! #hashtag?"
      expected: ["hashtag", "hashtag", "hashtag", "hashtag", "hashtag", "hashtag"]

    - description: "Hashtag after punctuations"
      text: ":#hashtag ;#hashtag ,#hashtag .#hashtag !#hashtag ?#hashtag"
      expected: ["hashtag", "hashtag", "hashtag", "hashtag", "hashtag", "hashtag"]

    # "\n" inside the double-quoted text is a real newline; it ends a tag.
    - description: "Hashtag before newline"
      text: "#hashtag\ntest\n#hashtag2\ntest\n#hashtag3\n"
      expected: ["hashtag", "hashtag2", "hashtag3"]

    # Negative cases: "#" glued to a URL scheme, and "#fragment" inside a URL
    # (with or without an explicit scheme), yield no hashtags.
    - description: "DO NOT extract hashtag when # is followed by URL"
      text: "#http://twitter.com #https://twitter.com"
      expected: []

    - description: "DO NOT extract hashtag if it's a part of URL"
      text: "http://twitter.com/#hashtag twitter.com/#hashtag"
      expected: []

    - description: "Extract hashtags with Latin extended characters"
      text: "#Azərbaycanca #mûǁae #Čeština #Ċaoiṁín"
      expected: ["Azərbaycanca", "mûǁae", "Čeština", "Ċaoiṁín"]

    # Arabic-script tags (Arabic and Persian words). The text separates some
    # tags with two spaces; every tag is still extracted.
    - description: "Extract Arabic hashtags"
      text: "#سیاست #ایران #السياسة #السياح #لغات  #اتمی  #کنفرانس #العربية #الجزيرة #فارسی"
      expected: ["سیاست", "ایران", "السياسة", "السياح", "لغات", "اتمی", "کنفرانس", "العربية", "الجزيرة", "فارسی"]

    # Underscore joins words inside a tag; a trailing "," or "." is dropped.
    - description: "Extract Arabic hashtags with underscore"
      text: "#برنامه_نویسی  #رییس_جمهور  #رئيس_الوزراء, #ثبت_نام. #لس_آنجلس"
      expected: ["برنامه_نویسی", "رییس_جمهور", "رئيس_الوزراء", "ثبت_نام", "لس_آنجلس"]

    # Hebrew tags keep vowel points and the Hebrew punctuation marks
    # maqaf (־), geresh (׳) and gershayim (״) that appear inside them.
    - description: "Extract Hebrew hashtags"
      text: "#עַל־יְדֵי #וכו׳ #מ״כ"
      expected: ["עַל־יְדֵי", "וכו׳", "מ״כ"]

    - description: "Extract Thai hashtags"
      text: "#ผู้เริ่ม #การเมือง #รายละเอียด #นักท่องเที่ยว #ของขวัญ #สนามบิน #เดินทาง #ประธาน"
      expected: ["ผู้เริ่ม", "การเมือง", "รายละเอียด", "นักท่องเที่ยว", "ของขวัญ", "สนามบิน", "เดินทาง", "ประธาน"]

    # The strings below contain invisible zero-width non-joiner characters;
    # they must survive editing and are part of the expected tags.
    - description: "Extract Arabic hashtags with Zero-Width Non-Joiner"
      text: "#أي‌بي‌إم #می‌خواهم"
      expected: ["أي‌بي‌إم", "می‌خواهم"]

    # Only the final word carries a "#", so only it is extracted.
    - description: "Extract Amharic hashtag"
      text: "የአላህ መልእክተኛ ሰለላሁ ዓለይሂ ወሰለም #ኢትዮሙስሊምስ"
      expected: ["ኢትዮሙስሊምስ"]

    # Contains an invisible zero-width joiner that is part of the tag.
    - description: "Extract Sinhala hashtag with Zero-Width Joiner (U+200D)"
      text: "#ශ්‍රීලංකා"
      expected: ["ශ්‍රීලංකා"]

    # Native digits may lead, trail, or follow an underscore inside a tag
    # that also contains letters.
    - description: "Extract Arabic and Persian hashtags with numbers"
      text: "#۳۴۵هشتگ #هشتگ۶۷۸ #ســـلام_عليكم_٤٠٦"
      expected: ["۳۴۵هشتگ","هشتگ۶۷۸","ســـلام_عليكم_٤٠٦"]

    # "#१२३४" consists of Devanagari digits only and is therefore absent
    # from the expected list; the digit suffix after "_" is kept.
    - description: "Extract Hindi hashtags"
      text: "#महात्मा #महात्मा_१२३४ #१२३४ गांधी"
      expected: ["महात्मा","महात्मा_१२३४"]

    - description: "Extract Indic script hashtags"
      text: "#বাংলা #ગુજરાતી #ಕನ್ನಡ #മലയാളം #ଓଡ଼ିଆ #ਪੰਜਾਬੀ #සිංහල #தமிழ் #తెలుగు"
      expected: ["বাংলা","ગુજરાતી","ಕನ್ನಡ","മലയാളം","ଓଡ଼ିଆ","ਪੰਜਾਬੀ","සිංහල","தமிழ்","తెలుగు"]

    # The first tag ends with the Tibetan tsheg mark "་", which is kept;
    # the second is the same word without it.
    - description: "Extract Tibetan hashtags"
      text: "#བོད་སྐད་ #བོད་སྐད"
      expected: ["བོད་སྐད་","བོད་སྐད"]

    - description: "Extract Khmer, Burmese, Laotian hashtags"
      text: "#មហាត្មះគន្ធី #မြင့်မြတ်သော #ຊີວະສາດ"
      expected: ["មហាត្មះគន្ធី","မြင့်မြတ်သော","ຊີວະສາດ"]

    - description: "Extract Greek hashtag"
      text: "#Μαχάτμα_Γκάντι ήταν Ινδός πολιτικός"
      expected: ["Μαχάτμα_Γκάντι"]

    - description: "Extract Armenian and Georgian hashtags"
      text: "#Մահաթմա #მაჰათმა"
      expected: ["Մահաթմա","მაჰათმა"]

    # The middle dot "·" between letters does not split the tag.
    - description: "Extract hashtag with middle dot"
      text: "#il·lusió"
      expected: ["il·lusió"]

    # Negative case: tags made only of underscores, digits or an iteration
    # mark contain no letter and are not extracted.
    - description: "DO NOT extract hashtags without a letter"
      text: "#_ #1_2 #122 #〃"
      expected: []

  # Hashtags whose characters lie outside the Basic Multilingual Plane.
  # The characters are written as YAML double-quoted "\UXXXXXXXX" escapes
  # (8 hex digits), so the quoting style of these strings must stay double.
  hashtags_from_astral:
    - description: "Extract hashtag with letter from astral plane (U+20021)"
      text: "#\U00020021"
      expected: ["\U00020021"]

    - description: "Extract hashtag with letter plus marker from astral plane (U+16f04 U+16f51)"
      text: "#\U00016f04\U00016f51"
      expected: ["\U00016f04\U00016f51"]

    # "\U00000041" is the ASCII letter "A", as the expected value shows.
    - description: "Extract hashtag with letter plus number from astral plane (U+104a0)"
      text: "#\U00000041\U000104a0"
      expected: ["A\U000104a0"]

  # Hashtag extraction with positions. Each expected entry pairs the tag text
  # (without "#") with indices: [start, end], where start is the offset of
  # the "#" and end is one past the last character of the tag.
  # Offsets count characters, not UTF-8 bytes (see the multi-byte cases).
  # NOTE(review): every text here is BMP-only, so these cases do not show
  # whether offsets are code points or UTF-16 units - confirm with the consumer.
  hashtags_with_indices:
    - description: "Extract a hastag at the start"
      text: "#hashtag here"
      expected:
        - hashtag: "hashtag"
          indices: [0, 8]

    - description: "Extract a hastag at the end"
      text: "test a #hashtag"
      expected:
        - hashtag: "hashtag"
          indices: [7, 15]

    - description: "Extract a hastag in the middle"
      text: "test a #hashtag in a string"
      expected:
        - hashtag: "hashtag"
          indices: [7, 15]

    # "#123" is digits only and is skipped; it does not shift the indices of
    # the valid tag that follows.
    - description: "Extract only a valid hashtag"
      text: "#123 a #hashtag in a string"
      expected:
        - hashtag: "hashtag"
          indices: [7, 15]

    # Three kanji plus one space precede the "#", hence start = 4.
    - description: "Extract a hashtag in a string of multi-byte characters"
      text: "会議中 #hashtag 会議中"
      expected:
        - hashtag: "hashtag"
          indices: [4, 12]

    - description: "Extract multiple valid hashtags"
      text: "One #two three #four"
      expected:
        - hashtag: "two"
          indices: [4, 8]
        - hashtag: "four"
          indices: [15, 20]

    # The trailing "!" is excluded from both the tag and its end index.
    - description: "Extract a non-latin hashtag"
      text: "Hashtags in #русский!"
      expected:
        - hashtag: "русский"
          indices: [12, 20]

    - description: "Extract multiple non-latin hashtags"
      text: "Hashtags in #中文, #日本語, #한국말, and #русский! Try it out!"
      expected:
        - hashtag: "中文"
          indices: [12, 15]
        - hashtag: "日本語"
          indices: [17, 21]
        - hashtag: "한국말"
          indices: [23, 27]
        - hashtag: "русский"
          indices: [33, 41]

  # Cashtag ("$SYMBOL") extraction. Expected values are the symbols without
  # the leading "$", in order of appearance.
  cashtags:
    # Runs of several spaces between cashtags are fine.
    - description: "Extract cashtags"
      text: "Example cashtags: $TEST $Stock   $symbol"
      expected: ["TEST", "Stock", "symbol"]

    # "." and "_" are allowed inside a symbol when followed by letters.
    - description: "Extract cashtags with . or _"
      text: "Example cashtags: $TEST.T $test.tt $Stock_X $symbol_ab"
      expected: ["TEST.T", "test.tt", "Stock_X", "symbol_ab"]

    # A digit anywhere in the candidate rejects it entirely.
    - description: "Do not extract cashtags if they contain numbers"
      text: "$123 $test123 $TE123ST"
      expected: []

    - description: "Do not extract cashtags with non-ASCII characters"
      text: "$ストック $株"
      expected: []

    - description: "Do not extract cashtags with punctuations"
      text: "$ $. $- $@ $! $() $+"
      expected: []

    # A dangling "." or "_" is cut off; the symbol before it is still returned.
    - description: "Do not include trailing . or _"
      text: "$TEST. $TEST_"
      expected: ["TEST", "TEST"]

    # Only "$OK" (at the start of the text) qualifies; a "$" that follows a
    # letter, a ".", or another "$" does not start a cashtag.
    - description: "Do not extract cashtags if there is no space before $"
      text: "$OK$NG$BAD text$NO .$NG $$NG"
      expected: ["OK"]

    # NOTE(review): the exact length limit is not derivable from this file;
    # the six-letter "symbol" and nine-character "symbol_ab" are accepted above.
    - description: "Do not extract too long cashtags"
      text: "$CashtagMustBeLessThanSixCharacter"
      expected: []

  # Cashtag extraction with positions. indices: [start, end] uses the offset
  # of the "$" as start and one past the symbol's last character as end.
  cashtags_with_indices:
    - description: "Extract cashtags"
      text: "Example: $TEST $symbol test"
      expected:
        - cashtag: "TEST"
          indices: [9, 14]
        - cashtag: "symbol"
          indices: [15, 22]

    # "." and "_" inside the symbol count toward its length.
    - description: "Extract cashtags with . or _"
      text: "Example: $TEST.T test $symbol_ab end"
      expected:
        - cashtag: "TEST.T"
          indices: [9, 16]
        - cashtag: "symbol_ab"
          indices: [22, 32]




# © 2015 - 2024 Weber Informatics LLC | Privacy Policy