All Downloads are FREE. Search and download functionalities are using the official Maven repository.

amazon.x-asin.sql Maven / Gradle / Ivy

The newest version!
-- noinspection SqlDialectInspectionForFile
-- noinspection SqlResolveForFile
-- noinspection SqlNoDataSourceInspectionForFile

select
    dom_uri(dom) as `url`,
    dom_base_uri(dom) as `base_Uri`,
    dom_first_text(dom, '#productTitle') as `title`,

    -- category
    str_substring_after(dom_first_href(dom, '#wayfinding-breadcrumbs_container ul li:last-child a'), '&node=') as `category_node_id`,
    cast(dom_all_hrefs(dom, '#wayfinding-breadcrumbs_container ul li a') as varchar) as `category_link_path`,
    dom_first_text(dom, '#wayfinding-breadcrumbs_container ul li:last-child a') as `category_name`,
    array_join_to_string(dom_all_texts(dom, '#wayfinding-breadcrumbs_container ul li a'), ' -:> ') as `category_name_path`,

    -- Q.A.
    str_first_integer(dom_first_text(dom, '#askATFLink, .askTopQandALoadMoreQuestions a'), 0) as `qa_num`,
    -- Reviews
    str_first_integer(dom_first_text(dom, '#acrCustomerReviewText, #reviewsMedley div[data-hook=total-review-count] span, #reviewsMedley span:contains(ratings), #reviewsMedley span:contains(customer ratings)'), 0) as `reviews`,

    -- brand
    dom_first_minimal_html(dom, 'div#centerCol a#bylineInfo') as `brand_html`,
    dom_first_attr(dom, 'div#centerCol a#bylineInfo', 'abs:href') as `brand_link`,
    dom_first_text(dom, 'div#centerCol a#bylineInfo') as `brand`,

    -- price
    dom_first_minimal_html(dom, '#price, #apex_desktop, div[data-feature-name=apex_desktop]') as `price_info_html`,
    dom_first_text(dom, '#price, #apex_desktop, div[data-feature-name=apex_desktop]') as `price_info_text`,
    dom_first_text(dom, '#apex_desktop span.basisPrice:contains(List Price)') as `list_price_info_text`,
    dom_all_texts(dom, '#apex_desktop span.basisPrice:contains(List Price) span:containsOwn($)') as `list_price`,
    dom_first_text(dom, '#apex_desktop .priceToPay span, #price tr td:matches(^Price) ~ td') as `price`,
    dom_first_text(dom, '#priceblock_dealprice, #price tr td:contains(Deal of the Day) ~ td, #apex_desktop tr td:contains(Deal of the Day) ~ td') as `with_deal`,
    dom_first_text(dom, '#apex_desktop .savingsPercentage, #dealprice_savings .priceBlockSavingsString, #price tr td:contains(You Save) ~ td') as `you_save`,

    -- product images
    dom_first_minimal_html(dom, '#landingImage, #imgTagWrapperId img, #imageBlock img:expr(width>400)') as `main_img`,
    dom_first_attr(dom, '#landingImage, #imgTagWrapperId img, #imageBlock img:expr(width>400)', 'data-old-hires') as `main_img_src`,
    dom_first_attr(dom, '#landingImage, #imgTagWrapperId img, #imageBlock img:expr(width>400)', 'data-a-dynamic-image') as `dynamic_img_srcs`,
    cast(dom_all_minimal_htmls(dom, '#imageBlock img') as varchar) as `gallery_html`,
    cast(dom_all_attrs(dom, '#imageBlock img', 'src') as varchar) as `gallery_links`,

    -- buy box
    dom_first_minimal_html(dom, ' #buybox, #desktop_buybox, #qualifiedBuybox, div[data-feature-name=desktop_buybox]') as `buy_box_html`,
    dom_first_text(dom, '#buybox, #desktop_buybox, #qualifiedBuybox, div[data-feature-name=desktop_buybox]') as `buy_box_text`,

    dom_first_text(dom, ' #qualifiedBuybox .a-price,' ||
                        ' #corePrice_feature_div,' ||
                        ' div[data-feature-name=corePrice]') as `buy_box_price`,

    -- seller info
    dom_all_texts(dom, ' a#sellerProfileTriggerId[href~=seller],' ||
                       ' #tabular-buybox tr:has(td:contains(Sold by)) td a[href~=seller],' ||
                       ' #usedbuyBox div:contains(Sold by) a[href~=seller],' ||
                       ' #merchant-info a[href~=seller],' ||
                       ' #tabular-buybox a[href~=seller],' ||
                       ' #buybox-tabular a[href~=seller]' ||
                       ' div.tabular-buybox-text[tabular-attribute-name*="Sold by"]'
        ) as `buy_box_sold_by`,
    dom_all_hrefs(dom, ' a#sellerProfileTriggerId[href~=seller],' ||
                       ' #tabular-buybox tr:has(td:contains(Sold by)) td a[href~=seller],' ||
                       ' #usedbuyBox div:contains(Sold by) a[href~=seller],' ||
                       ' #merchant-info a[href~=seller],' ||
                       ' #buybox-tabular a[href~=seller]'
        ) as `buy_box_seller_id`,
    dom_all_hrefs(dom, ' a#sellerProfileTriggerId[href~=seller],' ||
                       ' #tabular-buybox tr:has(td:contains(Sold by)) td a[href~=seller],' ||
                       ' #usedbuyBox div:contains(Sold by) a[href~=seller],' ||
                       ' #merchant-info a[href~=seller],' ||
                       ' #buybox-tabular a[href~=seller]'
        ) as `buy_box_marketplace_id`,
    dom_all_texts(dom, ' #tabular-buybox div.tabular-buybox-text[tabular-attribute-name=Ships from],' ||
                       ' #tabular-buybox tr:has(td:contains(Ships from)) td,' ||
                       ' #buybox-tabular tr:has(td:contains(Ships from)) td' ||
                       ' div.tabular-buybox-text[tabular-attribute-name*="Ships from"]'
        ) as `buy_box_ships_from`,
    dom_all_texts(dom, ' #tabular-buybox div.tabular-buybox-text[tabular-attribute-name=Dispatches from],' ||
                       ' #tabular-buybox tr:has(td:contains(Dispatches from)) td,' ||
                       ' #buybox-tabular tr:has(td:contains(Dispatches from)) td' ||
                       ' div.tabular-buybox-text[tabular-attribute-name=Dispatches from]'
        ) as `buy_box_dispatches_from`,

    -- other sellers, other sellers shows if `New (X) Seller` anchor clicked
    dom_first_text(dom, '#olp-upd-new-used a, #olp-upd-new a, .olp-text-box span') as `other_seller_text`,
    str_substring_between(dom_first_text(dom, '#olp-upd-new-used a, #olp-upd-new a, .olp-text-box span'), '(', ')') as `other_seller_num`,
    dom_all_hrefs(dom, '#aod-offer a[href*=seller]') as `other_seller_urls`,

    -- variations
    dom_first_minimal_html(dom, '#twister_feature_div, div[data-feature-name=twister], #variation_color_name') as `variations_html`,
    dom_first_text(dom, '#twister_feature_div, div[data-feature-name=twister], #variation_color_name') as `variations_text`,

    -- special flags
    str_is_not_empty(dom_first_text(dom, '#acBadge_feature_div i:contains(Best Seller)')) as `is_best_seller`,
    str_is_not_empty(dom_first_text(dom, '#acBadge_feature_div span:contains(Amazon),' ||
                                         ' div[data-feature-name=acBadge] span:contains(Amazon)')) as `is_amazon_choice`,
    str_is_not_empty(dom_first_text(dom, '#centerCol #couponBadgeRegularVpc')) as `is_coupon`,
    dom_first_text(dom, '#centerCol div i:contains(Prime Day Deal)') as `is_prime`,

    dom_first_text(dom, '#glow-ingress-block') as `global_deliver_to`,
    dom_first_text(dom, '#contextualIngressPtLabel_deliveryShortLine,' ||
                        ' #glowContextualIngressPt_feature_div,' ||
                        ' div[data-feature-name=glowContextualIngressPt]') as `deliver_to`,
    to_json(map(
       'globaldeliverto', dom_first_text(dom, '#glow-ingress-block'),
       'deliverto', dom_first_text(dom, '#contextualIngressPtLabel_deliveryShortLine,' ||
                                        ' #glowContextualIngressPt_feature_div,' ||
                                        ' div[data-feature-name=glowContextualIngressPt]')
    )) as `combined_deliver_to`,

    str_abbreviate(dom_first_text(dom, '#availability, #outOfStock'), 1024) as `in_stock`,
    dom_first_text(dom, '#selectQuantity select option:last-child') as `quantity`,
    dom_all_hrefs(dom, ' #availability a,' ||
                       ' #olp-upd-new-used a,' ||
                       ' #olp-upd-new a,' ||
                       ' #certified-refurbished-version a[href~=/dp/],' ||
                       ' #newer-version a[href~=/dp/]') as `sell_same_url`,

    array_join_to_string(dom_all_attrs(dom, '#sims-fbt #sims-fbt-content ul li[data-p13n-asin-metadata]', 'data-p13n-asin-metadata'), '^|^') as `bought_together_metadata`,
    array_join_to_string(dom_all_attrs(dom, '#sims-fbt #sims-fbt-content ul li[class~=sims-fbt-image] img', 'src'), '^|^') as `bought_to_gether_imgs`,
    array_join_to_string(dom_all_texts(dom, '#sims-fbt #sims-fbt-content ul li[data-p13n-asin-metadata]'), '^|^') as `bought_together`,

    str_is_not_empty(dom_first_text(dom, '#addToCart_feature_div span:contains(Add to Cart), #submit.add-to-cart-ubb-announce')) as `has_add_cart`,
    str_is_not_empty(dom_first_text(dom, '#buyNow span:contains(Buy now)')) as `has_buy`,

    -- product overview
    cast(dom_all_minimal_htmls(dom, '#productOverview_feature_div table,' ||
                                    ' div[data-feature-name=productOverview] table') as varchar) as `product_overview_html`,
    cast(dom_all_texts(dom, '#productOverview_feature_div table tr,' ||
                            ' div[data-feature-name=productOverview] table tr') as varchar) as `product_overview`,
    dom_first_text(dom, '#productOverview_feature_div table tr td:contains(Brand) ~ td,' ||
                        ' div[data-feature-name=productOverview] table tr td:contains(Brand) ~ td') as `product_overview_brand`,

    -- feature bullets
    cast(dom_all_minimal_htmls(dom, '#feature-bullets ul li, div[data-feature-name=featurebullets] ul li') as varchar) as `feature_bullets_html`,
    cast(dom_all_texts(dom, '#feature-bullets ul li, div[data-feature-name=featurebullets] ul li') as varchar) as `feature_bullets`,

    -- product information
    -- technical details
    dom_first_minimal_html(dom, ' table#productDetails_techSpec_section_1,' ||
                                ' table.prodDetTable') as `technical_details_html`,
    dom_first_text(dom, ' table#productDetails_techSpec_section_1,' ||
                        ' table.prodDetTable') as `technical_details_text`,
    dom_all_texts(dom, ' table#productDetails_techSpec_section_1 tr,' ||
                       ' table.prodDetTable tr') as `technical_details`,
    dom_first_text(dom, ' table#productDetails_techSpec_section_1 tr th:contains(Brand) ~ td,' ||
                        ' table.prodDetTable tr th:contains(Brand) ~ td') as `technical_details_brand`,
    dom_first_text(dom, ' table#productDetails_techSpec_section_1 tr th:contains(Manufacturer) ~ td,' ||
                        ' table.prodDetTable tr th:contains(Manufacturer) ~ td') as `technical_details_manufacturer`,
    dom_first_text(dom, ' #prodDetails table tr > th:contains(Product Dimensions) ~ td,' ||
                        ' #detailBullets_feature_div ul li span:contains(Package Dimensions) ~ span') as `technical_details_detail_volume`,
    dom_first_text(dom, ' #prodDetails table tr > th:contains(Item Weight) ~ td') as `technical_details_detail_weight`,

    -- additional information
    dom_first_minimal_html(dom, ' table#productDetails_detailBullets_sections1,' ||
                                ' #productDetails_db_sections table.prodDetTable') as `additional_info_html`,
    dom_first_text(dom, ' table#productDetails_detailBullets_sections1,' ||
                        ' #productDetails_db_sections table.prodDetTable') as `additional_info_text`,
    dom_all_texts(dom, ' table#productDetails_detailBullets_sections1 tr,' ||
                       ' #productDetails_db_sections table.prodDetTable tr') as `additional_info`,
    dom_first_text(dom, ' table#productDetails_detailBullets_sections1 tr th:contains(ASIN) ~ td,' ||
                        ' #productDetails_db_sections table.prodDetTable tr th:contains(ASIN) ~ td') as `additional_info_asin`,
    dom_first_minimal_html(dom, ' #prodDetails table tr > th:contains(Best Sellers Rank) ~ td,' ||
                                ' #detailBullets_feature_div ul li span:contains(Best Sellers Rank)') as `additional_info_rank_html`,
    dom_first_text(dom, ' #prodDetails table tr > th:contains(Best Sellers Rank) ~ td,' ||
                        ' #detailBullets_feature_div ul li span:contains(Best Sellers Rank)') as `additional_info_rank_text`,
    dom_first_text(dom, ' #prodDetails table tr > th:contains(Date First) ~ td,' ||
                        ' #detailBullets_feature_div ul li span:contains(Date First Available) ~ span') as `additional_info_on_sale_time`,

    cast(dom_all_attrs(dom, ' #prodDetails img[src], #productDescription img[src],' ||
                            ' #dpx-aplus-product-description_feature_div img[src],' ||
                            ' #dpx-aplus-3p-product-description_feature_div img[src]', 'src') as varchar) as `detail_imgs`,
    cast(dom_all_hrefs(dom, '#rvs-vse-related-videos ol li a[href~=/vdp/]') as varchar) as `detail_videos`,

    dom_first_text(dom, ' #prodDetails table tr > th:contains(ASIN) ~ td,' ||
                        ' #detailBullets_feature_div ul li span:contains(ASIN) ~ span') as `detail_asin`,
    cast(dom_all_minimal_htmls(dom, '#prodDetails h1:contains(Feedback) ~ div a') as varchar) as `feedback_url_html`,
    dom_all_attrs(dom, '#prodDetails h1:contains(Feedback) ~ div a', 'abs:src') as `feedback_urls`,

    -- description
    dom_first_minimal_html(dom, 'h2:contains(Product Description) ~ div') as `product_description_html`,
    dom_first_text(dom, 'h2:contains(Product Description) ~ div') as `product_description_text`,
    dom_all_attrs(dom, 'h2:contains(Product Description) ~ div img', 'abs:src') as `product_description_images`,

    array_length(dom_all_imgs(dom, '#prodDetails img[src], #productDescription img[src]')) as `is_a_plus`,

    -- From the brand (A+ Brand Story)
    dom_first_minimal_html(dom, ' #aplusBrandStory_feature_div,' ||
                                ' div[data-feature-name=aplusBrandStory]') as `brand_story_html`,
    dom_first_text(dom, ' #aplusBrandStory_feature_div,' ||
                        ' div[data-feature-name=aplusBrandStory]') as `brand_story_text`,
    dom_all_minimal_htmls(dom, ' #aplusBrandStory_feature_div .apm-brand-story-card a[href~=/dp/],' ||
                               ' div[data-feature-name=aplusBrandStory] .apm-brand-story-card a[href~=/dp/]') as `brand_story_product_htmls`,
    dom_all_attrs(dom, ' #aplusBrandStory_feature_div .apm-brand-story-card a[href~=/dp/] img,' ||
                       ' div[data-feature-name=aplusBrandStory] .apm-brand-story-card a[href~=/dp/] img', 'alt') as `brand_story_product_names`,
    dom_all_attrs(dom, ' #aplusBrandStory_feature_div .apm-brand-story-card a[href~=/dp/],' ||
                       ' div[data-feature-name=aplusBrandStory] .apm-brand-story-card a[href~=/dp/]', 'abs:href') as `brand_story_product_links`,
    dom_all_attrs(dom, ' #aplusBrandStory_feature_div .apm-brand-story-card a[href~=/dp/] img,' ||
                       ' div[data-feature-name=aplusBrandStory] .apm-brand-story-card a[href~=/dp/] img', 'abs:src') as `brand_story_product_images`,

    cast(dom_all_texts(dom, '#reviewsMedley div[data-hook=lighthut-terms-list] a, #reviewsMedley h3:contains(Read reviews that mention) ~ div a') as varchar) as `reviews_mention`,
    str_first_float(dom_first_text(dom, '#reviewsMedley .AverageCustomerReviews span:contains(out of)'), 0.0) as `score`,
    str_first_integer(dom_first_text(dom, '#reviewsMedley div[data-hook=total-review-count], #acrCustomerReviewText'), 0) as `star_num`,
    dom_first_text(dom, 'table#histogramTable:expr(width > 100) td:contains(5 star) ~ td:contains(%)') as `score_5_percent`,
    dom_first_text(dom, 'table#histogramTable:expr(width > 100) td:contains(4 star) ~ td:contains(%)') as `score_4_percent`,
    dom_first_text(dom, 'table#histogramTable:expr(width > 100) td:contains(3 star) ~ td:contains(%)') as `score_3_percent`,
    dom_first_text(dom, 'table#histogramTable:expr(width > 100) td:contains(2 star) ~ td:contains(%)') as `score_2_percent`,
    dom_first_text(dom, 'table#histogramTable:expr(width > 100) td:contains(1 star) ~ td:contains(%)') as `score_1_percent`,
    dom_all_texts(dom, 'div#cr-dp-summarization-attributes div[data-hook=cr-summarization-attribute]') as `scores_by_feature`,
    dom_first_href(dom, '#reviews-medley-footer a') as `reviews_url`,

    dom_first_attr(dom_owner_document(dom), 'head meta[name=keywords]', 'content') as `meta_keywords`,
    dom_first_attr(dom_owner_document(dom), 'head meta[name=description]', 'content') as `meta_description`,

    -- variables from javascript
    dom_first_text(dom_owner_body(dom), '#pulsarJsVariables pre.jsVariables') as `js_variables`,
    dom_first_text(dom_owner_body(dom), '#pulsarJsVariables pre.parentAsin') as `js_pasin`,
    dom_first_text(dom_owner_body(dom), '#pulsarJsVariables pre.variationValues') as `js_variation_values`,
    dom_first_text(dom_owner_body(dom), '#pulsarJsVariables pre.asinVariationValues') as `js_asin_variation_values`,
    dom_first_text(dom_owner_body(dom), '#pulsarJsVariables pre.num_total_variations') as `js_num_total_variations`,

    -- variables from input
    dom_all_attrs(dom, 'input[type=hidden]', 'name') as `input_names`,
    dom_all_attrs(dom, 'input[type=hidden]', 'value') as `input_values`,
    dom_all_multi_attrs(dom, 'input[type=hidden]', make_array('name', 'value')) as `input_properties`,
    dom_first_attr(dom, 'input[name=asin]', 'value') as `input_asin`,
    dom_first_attr(dom, 'input[name=marketplaceId]', 'value') as `input_marketplace_id`,
    dom_first_attr(dom, 'input[name=productTitle]', 'value') as `input_product_title`,
    dom_first_attr(dom, 'input[name=priceSymbol]', 'value') as `input_price_symbol`,
    dom_first_attr(dom, 'input[name=priceValue]', 'value') as `input_price_value`,
    dom_first_attr(dom, 'input[name=productImageUrl]', 'value') as `input_product_image_url`,

    -- variables from metadata
    dom_first_attr(dom, '#PulsarMetaInformation', 'href') as `meta_href`,
    dom_first_attr(dom, '#PulsarMetaInformation', 'referer') as `meta_referer`,
    dom_first_attr(dom, '#PulsarMetaInformation', 'referrer') as `meta_referrer`,
    dom_first_attr(dom, '#PulsarMetaInformation', 'label') as `meta_label`,
    time_first_mysql_date_time(dom_attr(dom_select_first(dom_owner_body(dom), '#PulsarMetaInformation'), 'taskTime')) as `meta_task_time`,

    -- all feature names defined by amazon
    dom_all_attrs(dom, '*[data-feature-name]', 'data-feature-name') as `amazon_feature_names`,

    -- dom features calculated by PulsarRPA
    dom_ch(dom_owner_body(dom)) as `pulsar_num_chars`,
    dom_a(dom_owner_body(dom)) as `pulsar_num_links`,
    dom_img(dom_owner_body(dom)) as `pulsar_num_imgs`,
    dom_height(dom_owner_body(dom)) as `pulsar_height`
from load_and_select(@url, ':root body');




© 2015 - 2024 Weber Informatics LLC | Privacy Policy