{:meta/version 1
;; Set the preferred format.
;; Available options:
;; - Markdown (default)
;; - Org
;; :preferred-format "Markdown"
;; Set the preferred workflow style.
;; Available options:
;; - :now for NOW/LATER style (default)
;; - :todo for TODO/DOING style
:preferred-workflow :now
;; Exclude directories/files.
;; Example usage:
;; :hidden ["/archived" "/test.md" "../assets/archived"]
:hidden []
;; Define the default journal page template.
;; Enter the template name between the quotes.
:default-templates
{:journals ""}
;; Set a custom date format for the journal page title.
;; Default value: "MMM do, yyyy"
;; e.g., "Jan 19th, 2038"
;; Example: "EEE do, MMM yyyy" renders as "Tue 19th, Jan 2038"
;; :journal/page-title-format "EEE do, MMM yyyy"
;; Specify the journal filename format using a valid date format string.
;; !Warning:
;; This configuration is not retroactive and affects only new journals.
;; To show old journal files in the app, manually rename the files in the
;; journal directory to match the new format.
;; Default value: "yyyy_MM_dd"
;; :journal/file-name-format "yyyy_MM_dd"
;; Enable tooltip preview on hover.
;; Default value: true
:ui/enable-tooltip? true
;; Display brackets [[]] around page references.
;; Default value: true
;; :ui/show-brackets? true
;; Display all lines of a block when referencing ((block)).
;; Default value: false
:ui/show-full-blocks? false
;; Automatically expand block references when zooming in.
;; Default value: true
:ui/auto-expand-block-refs? true
;; Enable Block timestamps.
;; Default value: false
:feature/enable-block-timestamps? false
;; Disable accent marks when searching.
;; After changing this setting, rebuild the search index by pressing (^C ^S).
;; Default value: true
:feature/enable-search-remove-accents? true
;; Enable journals.
;; Default value: true
;; :feature/enable-journals? true
;; Enable flashcards.
;; Default value: true
;; :feature/enable-flashcards? true
;; Enable whiteboards.
;; Default value: true
;; :feature/enable-whiteboards? true
;; Disable the journal's built-in 'Scheduled tasks and deadlines' query.
;; Default value: false
;; :feature/disable-scheduled-and-deadline-query? false
;; Specify the number of days displayed in the future for
;; the 'scheduled tasks and deadlines' query.
;; Example usage:
;; Display all scheduled and deadline blocks for the next 14 days:
;; :scheduled/future-days 14
;; Default value: 7
;; :scheduled/future-days 7
;; Specify the first day of the week.
;; Available options:
;; - integer from 0 to 6 (Monday to Sunday)
;; Default value: 6 (Sunday)
:start-of-week 6
;; Specify a custom CSS import.
;; This option takes precedence over the local `logseq/custom.css` file.
;; Example usage:
;; :custom-css-url "@import url('https://cdn.jsdelivr.net/gh/dracula/logseq@master/custom.css');"
;; Specify a custom JS import.
;; This option takes precedence over the local `logseq/custom.js` file.
;; Example usage:
;; :custom-js-url "https://cdn.logseq.com/custom.js"
;; Set a custom Arweave gateway
;; Default gateway: https://arweave.net
;; :arweave/gateway "https://arweave.net"
;; Set bullet indentation when exporting
;; Available options:
;; - `:eight-spaces` as eight spaces
;; - `:four-spaces` as four spaces
;; - `:two-spaces` as two spaces
;; - `:tab` as a tab character (default)
;; :export/bullet-indentation :tab
;; Publish all pages within the graph,
;; regardless of whether individual pages have been marked as public.
;; Default value: false
;; :publishing/all-pages-public? false
;; Define the default home page and sidebar status.
;; If unspecified, the journal page will be loaded on startup and the right sidebar will stay hidden.
;; The `:page` value represents the name of the page displayed at startup.
;; Available options for `:sidebar` are:
;; - "Contents" to display the Contents page in the right sidebar.
;; - A specific page name to display in the right sidebar.
;; - An array of multiple pages, e.g., ["Contents" "Page A" "Page B"].
;; If `:sidebar` remains unset, the right sidebar will stay hidden.
;; Examples:
;; 1. Set "Changelog" as the home page and display "Contents" in the right sidebar:
;; :default-home {:page "Changelog", :sidebar "Contents"}
;; 2. Set "Jun 3rd, 2021" as the home page without the right sidebar:
;; :default-home {:page "Jun 3rd, 2021"}
;; 3. Set "home" as the home page and display multiple pages in the right sidebar:
;; :default-home {:page "home", :sidebar ["Page A" "Page B"]}
;; Set the default location for storing notes.
;; Default value: "pages"
;; :pages-directory "pages"
;; Set the default location for storing journals.
;; Default value: "journals"
;; :journals-directory "journals"
;; Set the default location for storing whiteboards.
;; Default value: "whiteboards"
;; :whiteboards-directory "whiteboards"
;; Enabling this option converts
;; [[Grant Ideas]] to [[file:./grant_ideas.org][Grant Ideas]] for org-mode.
;; For more information, visit https://github.com/logseq/logseq/issues/672
;; :org-mode/insert-file-link? false
;; Configure custom shortcuts.
;; Syntax:
;; 1. + indicates simultaneous key presses, e.g., `Ctrl+Shift+a`.
;; 2. A space between keys represents key chords, e.g., `t s` means
;; pressing `t` followed by `s`.
;; 3. mod refers to `Ctrl` for Windows/Linux and `Command` for Mac.
;; 4. Use false to disable a specific shortcut.
;; 5. You can define multiple bindings for a single action, e.g., ["ctrl+j" "down"].
;; The full list of configurable shortcuts is available at:
;; https://github.com/logseq/logseq/blob/master/src/main/frontend/modules/shortcut/config.cljs
;; Example:
;; :shortcuts
;; {:editor/new-block "enter"
;; :editor/new-line "shift+enter"
;; :editor/insert-link "mod+shift+k"
;; :editor/highlight false
;; :ui/toggle-settings "t s"
;; :editor/up ["ctrl+k" "up"]
;; :editor/down ["ctrl+j" "down"]
;; :editor/left ["ctrl+h" "left"]
;; :editor/right ["ctrl+l" "right"]}
:shortcuts {}
;; Configure the behavior of pressing Enter in document mode.
;; If set to true, pressing Enter will create a new block.
;; Default value: false
:shortcut/doc-mode-enter-for-new-block? false
;; Block content larger than `block/content-max-length` will not be searchable
;; or editable for performance.
;; Default value: 10000
:block/content-max-length 10000
;; Display command documentation on hover.
;; Default value: true
:ui/show-command-doc? true
;; Display empty bullet points.
;; Default value: false
:ui/show-empty-bullets? false
;; Pre-defined :view function to use with advanced queries.
:query/views
{:pprint
(fn [r] [:pre.code (pprint r)])}
;; Advanced queries `:result-transform` function.
;; Transform the query result before displaying it.
:query/result-transforms
{:sort-by-priority
(fn [result] (sort-by (fn [h] (get h :block/priority "Z")) result))}
;; The following queries will be displayed at the bottom of today's journal page.
;; The "NOW" query returns tasks with "NOW" or "DOING" status.
;; The "NEXT" query returns tasks with "NOW", "LATER", or "TODO" status.
:default-queries
{:journals
[{:title "🔨 NOW"
:query [:find (pull ?h [*])
:in $ ?start ?today
:where
[?h :block/marker ?marker]
[(contains? #{"NOW" "DOING"} ?marker)]
[?h :block/page ?p]
[?p :block/journal? true]
[?p :block/journal-day ?d]
[(>= ?d ?start)]
[(<= ?d ?today)]]
:inputs [:14d :today]
:result-transform (fn [result]
(sort-by (fn [h]
(get h :block/priority "Z")) result))
:group-by-page? false
:collapsed? false}
{:title "📅 NEXT"
:query [:find (pull ?h [*])
:in $ ?start ?next
:where
[?h :block/marker ?marker]
[(contains? #{"NOW" "LATER" "TODO"} ?marker)]
[?h :block/page ?p]
[?p :block/journal? true]
[?p :block/journal-day ?d]
[(> ?d ?start)]
[(< ?d ?next)]]
:inputs [:today :7d-after]
:group-by-page? false
:collapsed? false}]}
;; Add custom commands to the command palette
;; Example usage:
;; :commands
;; [
;; ["js" "Javascript"]
;; ["md" "Markdown"]
;; ]
:commands []
;; Enable collapsing blocks with titles but no children.
;; By default, only blocks with children can be collapsed.
;; Setting `:outliner/block-title-collapse-enabled?` to true allows collapsing
;; blocks with titles (multiple lines) and content. For example:
;; - block title
;; block content
;; Default value: false
:outliner/block-title-collapse-enabled? false
;; Macros replace text and can make you more productive.
;; Example usage:
;; Change the :macros value below to:
;; {"poem" "Rose is $1, violet's $2. Life's ordered: Org assists you."}
;; input "{{poem red,blue}}"
;; becomes
;; Rose is red, violet's blue. Life's ordered: Org assists you.
:macros {}
;; Configure the default expansion level for linked references.
;; For example, consider the following block hierarchy:
;; - a [[page]] (level 1)
;; - b (level 2)
;; - c (level 3)
;; - d (level 4)
;;
;; With the default value of level 2, block b will be collapsed.
;; If the level's value is set to 3, block c will be collapsed.
;; Default value: 2
:ref/default-open-blocks-level 2
;; Configure the threshold for linked references before collapsing.
;; Default value: 100
:ref/linked-references-collapsed-threshold 50
;; Graph view configuration.
;; Example usage:
;; :graph/settings
;; {:orphan-pages? true ; Default value: true
;; :builtin-pages? false ; Default value: false
;; :excluded-pages? false ; Default value: false
;; :journal? false} ; Default value: false
;; Graph view force settings.
;; Example usage:
;; :graph/forcesettings
;; {:link-dist 180 ; Default value: 180
;; :charge-strength -600 ; Default value: -600
;; :charge-range 600} ; Default value: 600
;; Favorites to list on the left sidebar
:favorites []
;; Set the flashcards learning fraction.
;; Expected value: a float between 0 and 1.
;; Higher values result in faster changes to the next review interval.
;; Default value: 0.5
;; :srs/learning-fraction 0.5
;; Set the initial interval after the first successful review of a card.
;; Default value: 4
;; :srs/initial-interval 4
;; Hide specific block properties.
;; Example usage:
;; :block-hidden-properties #{:public :icon}
;; Create a page for all properties.
;; Default value: true
:property-pages/enabled? true
;; Properties to exclude from having property pages
;; Example usage:
;; :property-pages/excludelist #{:duration :author}
;; By default, property values separated by commas will not be treated as
;; page references. You can add properties here to enable it.
;; Example usage:
;; :property/separated-by-commas #{:alias :tags}
;; Properties that are ignored when parsing property values for references
;; Example usage:
;; :ignored-page-references-keywords #{:author :website}
;; logbook configuration.
;; :logbook/settings
;; {:with-second-support? false ;limit logbook to minutes, seconds will be eliminated
;; :enabled-in-all-blocks true ;display logbook in all blocks after timetracking
;; :enabled-in-timestamped-blocks false ;don't display logbook at all
;; }
;; Mobile photo upload configuration.
;; :mobile/photo
;; {:allow-editing? true
;; :quality 80}
;; Mobile features options
;; Gestures
;; Example usage:
;; :mobile
;; {:gestures/disabled-in-block-with-tags ["kanban"]}
;; Extra CodeMirror options
;; See https://codemirror.net/5/doc/manual.html#config for possible options
;; Example usage:
;; :editor/extra-codemirror-options
;; {:lineWrapping false ; Default value: false
;; :lineNumbers true ; Default value: true
;; :readOnly false} ; Default value: false
;; Enable logical outdenting
;; Default value: false
;; :editor/logical-outdenting? false
;; Prefer pasting the file when text and a file are in the clipboard.
;; Default value: false
;; :editor/preferred-pasting-file? false
;; Quick capture templates for receiving content from other apps.
;; Each template can contain the elements {time}, {date}, {text} and {url}, which are
;; auto-expanded when receiving content from other apps. Note: the braces cannot be omitted.
;; - {time}: capture time
;; - {date}: capture date using current date format, use `[[{date}]]` to get a page reference
;; - {text}: text that users selected before sharing.
;; - {url}: URL or assets path for media files stored in Logseq.
;; You can also reorder them or use only one or two of them in the template.
;; You can also insert or format any text in the template, as shown in the following examples.
;; :quick-capture-templates
;; {:text "[[quick capture]] **{time}**: {text} from {url}"
;; :media "[[quick capture]] **{time}**: {url}"}
;; Quick capture options.
;; - insert-today? Insert the capture at the end of today's journal page (boolean).
;; - redirect-page? Redirect to the quick capture page after capturing (boolean).
;; - default-page The default page to capture to if insert-today? is false (string).
;; :quick-capture-options
;; {:insert-today? false ;; Default value: true
;; :redirect-page? false ;; Default value: false
;; :default-page "quick capture"} ;; Default page: "quick capture"
;; File sync options
;; Ignore these files when syncing, regexp is supported.
;; :file-sync/ignore-files []
;; Configure the Enter key behavior for
;; context-aware editing with DWIM (Do What I Mean).
;; context-aware Enter key behavior implies that pressing Enter will
;; have different outcomes based on the context.
;; For instance, pressing Enter within a list generates a new list item,
;; whereas pressing Enter in a block reference opens the referenced block.
;; :dwim/settings
;; {:admonition&src? true ;; Default value: true
;; :markup? false ;; Default value: false
;; :block-ref? true ;; Default value: true
;; :page-ref? true ;; Default value: true
;; :properties? true ;; Default value: true
;; :list? false} ;; Default value: false
;; Configure the escaping method for special characters in page titles.
;; Warning:
;; This is a dangerous operation. To modify the setting,
;; access the 'Filename format' setting and follow the instructions.
;; Otherwise, you may need to manually rename all affected files and
;; re-index them on all clients after synchronization.
;; Incorrect handling may result in messy page titles.
;; Available options:
;; - :triple-lowbar (default)
;; ;use triple underscore `___` for slash `/` in page title
;; ;use Percent-encoding for other invalid characters
:file/name-format :triple-lowbar}
tags:: bisonex, validation
- ## Missed variants
- GABRA5 is the most problematic, as the thresholds would really need to be lowered
```bash
[mpileup] 1 samples in 1 input files
chr15 26869324 N 19 AaAaaTTATtTTttAattA k:!k!!kFk!!kk!k!k!F
[mpileup] 1 samples in 1 input files
chr15 27114471 N 19 AaAaaTTATtTTttAattA k:!k!!kFk!!kk!k!k!F
```
-
- Probably a problem with the fastq files? Arguments in favour:
- did not have this problem on the other variants
- bwa-mem seems to report an error
```bash
[M::mem_process_seqs] Processed 734826 reads in 150.735 CPU sec, 6.449 real sec
[W::bseq_read] the 1st file has fewer sequences.
[W::bseq_read] the 1st file has fewer sequences.
[gzclose] buffer error
[bam_sort_core] merg
```
- Indeed:
```bash
du -hs *63060439*/*
558M 2200519525_63060439/63060439_S92_R1_001.fastq.gz
2.6G 2200519525_63060439/63060439_S92_R2_001.fastq.gz
```
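A quick way to confirm that the R1 file is truncated (a sketch; it counts reads as FASTQ lines divided by 4):
```bash
# zcat will also complain at the end of a truncated gzip stream.
for f in 2200519525_63060439/63060439_S92_R*_001.fastq.gz; do
  printf '%s: ' "$f"
  zcat "$f" | wc -l | awk '{print $1/4, "reads"}'
done
```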
- We check that this does not come from the alignment (GRCh38-noalt vs GRCh37):
```bash
samtools mpileup 2200519525-63060439-GRCh37.aligned.bam -r chr15:27114471-27114471 -Q 0
samtools index 2200519525-63060439-GRCh37.aligned.bam
samtools index 2200519525-63060439-GRCh38-noalt.aligned.bam
samtools mpileup 2200519525-63060439-GRCh38-noalt.aligned.bam -r chr15:26869324-26869324 -Q 0
bwa mem -R '@RG\tID:sample\tSM:sample\tPL:ILLUMINA\tPM:Miseq\tCN:CHU_Minjoz\tLB:definition_to_add' -t 24 /Work/Groups/bisonex/data/fasta/GRCh37/hg19.p13.plusMT.no_alt_analysis_set/hg19.p13.plusMT.no_alt_analysis_set.fa 63060439_S92_R1_001.fastq.gz 63060439_S92_R2_001.fastq.gz | samtools sort --threads 24 -o 2200519525-63060439-GRCh37.aligned.bam -
bwa mem -R '@RG\tID:sample\tSM:sample\tPL:ILLUMINA\tPM:Miseq\tCN:CHU_Minjoz\tLB:definition_to_add' -t 24 /Work/Groups/bisonex/data/fasta/GRCh38-noalt/bwa/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna 63060439_S92_R1_001.fastq.gz 63060439_S92_R2_001.fastq.gz | samtools sort --threads 24 -o 2200519525-63060439-GRCh38-noalt.aligned.bam -
```
- [[Centogene config]]
-
- We keep only the negative exomes and the variants with gnomAD exome AF <= 1% if dominant, 5% otherwise
- Then, in LibreOffice, we filter those without OMIM clinical features and with CADD >= 15
- Finally, we only look at truncating variants with pLI > 1
- CDC42BPB NM_006035.4:c.31G>T: class 4 (Franklin)? Truncating at the start of the gene, NMD escape, ID phenotype but overlap only on alopecia
# Add a new package
New packages go in pkgs/by-name.
```bash
nix-build -A mypackage
nix-shell -p nixpkgs-review --run "nixpkgs-review rev HEAD"
```
Check the environment variables:
```bash
env
```
# Debug a package
```bash
cd nixpkgs
mkdir lol
cd lol
nix-shell ../ -A kent
```
The simplest approach is to use genericBuild with the different phases, for example:
```bash
phases="checkPhase installPhase" genericBuild
```
List of phases: unpackPhase patchPhase configurePhase buildPhase
checkPhase installPhase fixupPhase installCheckPhase distPhase
See: <https://nixos.wiki/wiki/Nixpkgs/Create_and_debug_packages>
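Inside that nix-shell, the phases can also be run one at a time rather than through genericBuild (a sketch of the usual stdenv workflow; stop before installPhase, since $out is not writable in a plain nix-shell):
```bash
# Each phase is a shell function defined by stdenv's setup, which nix-shell has already sourced.
unpackPhase        # unpacks $src into the current directory
cd "$sourceRoot"   # directory created by unpackPhase
patchPhase
configurePhase
buildPhase
```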
# Tips
## Driver problems
Use nixGL (https://github.com/nix-community/nixGL) if you are not on NixOS.
Test the dependencies.
Test in nixpkgs that it builds.
- Official formatter (adoption in progress as of 28 March 2024, https://discourse.nixos.org/t/call-for-testing-nix-formatter/39179/5)
```sh
nix profile install nixpkgs#nixfmt-rfc-style
nixfmt package.nix
```
https://github.com/NixOS/nixpkgs/blob/master/pkgs/README.md
# Validation
# Real data
# Inspiration
## Bcbio
Community validation workflows: https://github.com/bcbio/bcbio_validation_workflows
- GIAB: the genomes are restricted to the exomes
- NA12878: (Caucasian daughter): 65x NovaSeq TruSeq Nano
- NA24385 (Ashkenazi Jewish son): 50x HiSeq x10 dataset from 10x genomics
- NA24631 (Chinese son)
- GIAB exomes
- NA12878 exome -- From Illumina Basespace Public datasets, 200x NovaSeq S2 using the Illumina TruSeq DNA Library Prep for Enrichment with IDT xGen Exome Research Panel v1.0, trimmed to 2x101 for analysis.
- NA24385 exome -- from Oslo University Hospital's contribution to the Genome in a Bottle Project: 150x Illumina HiSeq 2500 with 150bp paired-end reads, Agilent SureSelect Human All Exon V5 kit
- NA24631 exome -- also from Oslo University Hospital's GiaB contribution, 100x Illumina HiSeq 2500 sequencing.
- CHM: the genomes are restricted to the exomes
- structural variant
- somatic
## Other
Methodology: https://medium.com/dnanexus/benchmarking-state-of-the-art-secondary-variant-calling-pipelines-5472ca6bace7
We downloaded raw HG001 WES reads from NIST FTP site with ~141X on-target coverage.
Raw WES reads for HG002, HG003, HG004, and HG005 were downloaded from the SRA study, SRP047086
https://trace.ncbi.nlm.nih.gov/Traces/index.html?view=study&acc=SRP047086
There is one exome each for HG002-HG004 on Illumina HiSeq 2500 with the Agilent SureSelect Human All Exon V5 kit.
These are the files available on the GIAB FTP, but the fastq files are directly accessible on ENA with these accessions!
https://trace.ncbi.nlm.nih.gov/Traces/study/?acc=SRP047086&f=assay_type_s%3An%3Awxs%3Ac&o=acc_s%3Aa
https://zenodo.org/records/3597727
# GIAB
## GIAB data
https://github.com/genome-in-a-bottle/giab_data_indexes
For HG002, 3, 4 and 5, only the BAMs are available on the GIAB FTP (see the table).
The fastq files could be recovered via https://trace.ncbi.nlm.nih.gov/Traces/index.html?view=study&acc=SRP047086, filtering on AssayType = WXS
SRR2962669 OSLO UNIVERSITY HOSPITAL SRX1453593 Illumina HiSeq 2500 HG002_WES_OUS PAIRED Hybrid Selection HG002 male SRP047086
SRR2962692 OSLO UNIVERSITY HOSPITAL SRX1453614 Illumina HiSeq 2500 HG003_WES_OUS PAIRED Hybrid Selection HG003 male SRP047086
SRR2962693 OSLO UNIVERSITY HOSPITAL SRX1453615 Illumina HiSeq 2500 HG005_WES_OUS PAIRED Hybrid Selection HG005 male SRP047086
SRR2962694 OSLO UNIVERSITY HOSPITAL SRX1453616 Illumina HiSeq 2500 HG004_WES_OUS PAIRED Hybrid Selection HG004 female SRP047086
Looking at one experiment, we see that the capture is the Agilent SureSelect Human All Exon V5 kit: https://www.ncbi.nlm.nih.gov/sra/SRX1453593
With ENA, the FASTQ files are directly available (not easily accessible on SRA).
| Patient | Data | Capture | BED | Sequencer | Link |
| --- | --- | --- | --- | --- | --- |
| HG001 | fastq | Nextera Rapid Capture Exome | ftp, hg19 | HiSeq | https://ftp.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/NA12878/Garvan_NA12878_HG001_HiSeq_Exome/ |
| HG002 | BAM | ? | ? | HiSeq | https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/AshkenazimTrio/HG002_NA24385_son/OsloUniversityHospital_Exome/ |
| HG003 | BAM | ? | ? | HiSeq | https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/AshkenazimTrio/HG003_NA24149_father/OsloUniversityHospital_Exome/ |
| HG004 | BAM | ? | ? | HiSeq | https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/AshkenazimTrio/HG004_NA24143_mother/OsloUniversityHospital_Exome/ |
| HG005 | BAM | ? | ? | HiSeq | https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/ChineseTrio/HG005_NA24631_son/OsloUniversityHospital_Exome/ |
| HG002 | fastq | Agilent SureSelect All Exon v5 | | HiSeq2500 | https://www.ebi.ac.uk/ena/browser/view/SRX1453593 |
| HG003 | fastq | Agilent SureSelect All Exon v5 | | HiSeq2500 | https://www.ebi.ac.uk/ena/browser/view/SRX1453614 |
| HG005 | fastq | Agilent SureSelect All Exon v5 | | HiSeq2500 | https://www.ebi.ac.uk/ena/browser/view/SRX1453615 |
| HG004 | fastq | Agilent SureSelect All Exon v5 | | HiSeq2500 | https://www.ebi.ac.uk/ena/browser/view/SRX1453616 |
No HG006 or HG007.
bcbio uses the hg19 BED and converts it to hg38 (GIAB): https://github.com/bcbio/bcbio_validation_workflows/blob/master/giab-exome/input/get_data.sh
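To do the conversion by hand instead, a liftOver sketch (the chain file is the standard UCSC one and the BED names are placeholders, not files from these notes):
```bash
# Lift the hg19 capture BED over to hg38; intervals that cannot be mapped go to unmapped.bed.
wget https://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz
liftOver capture_hg19.bed hg19ToHg38.over.chain.gz capture_hg38.bed unmapped.bed
```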
### Others
| Patient | Genome | Sequencer | Capture | Accession | Link |
| --- | --- | --- | --- | --- | --- |
| HG001 | hg38 | HiSeq 4000 | Agilent SureSelect v7 | SRX11061486 | https://github.com/kevinblighe/agilent |
| HG001 | hg38 | NovaSeq 6000 | Agilent SureSelect v7 | SRX11061516 | idem |
| HG001 | hg19 | HiSeq2000 | SeqCap EZ Human Exome Lib v3.0 | SRR1611178 | http://hgdownload.soe.ucsc.edu/gbdb/hg19/exomeProbesets/ |
| HG001 | hg19 | HiSeq2000 | SeqCap EZ Human Exome Lib v3.0 | SRR1611179 | idem |
| HG001 | hg19 | HiSeq2500 | SeqCap EZ Human Exome Lib v3.0 | SRR1611183 | idem |
| HG001 | hg19 | HiSeq2500 | SeqCap EZ Human Exome Lib v3.0 | SRR1611184 | idem |
For Agilent, we take the *Region.bed file.
https://emea.support.illumina.com/downloads/truseq-exome-product-files.html
Reference genome
[GRC38](https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.26_GRCh38/GRCh38_major_release_seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz)
Reference VCF + BED
- [HG001](https://ftp-trace.ncbi.nih.gov/ReferenceSamples/giab/release/NA12878_HG001/latest/GRCh38/)
## Google data (Baid et al. 2020)
https://www.biorxiv.org/content/10.1101/2020.12.11.422022v1.full
Motivation: the GIAB data date from 2016...
Patients:
- HG002 to HG007
- NA12891-NA12892
| | Agilentv7 | IDT-xGen | Truseq |
| --- | --- | --- | --- |
| HiSeq 4000 | 50x | 50x, 75x, 100x | 50x, 75x |
| NovaSeq | 50x, 75x, 100x | 50x, 75x, 100x | 50x, 75x, 100x |

- (Nextera: mentioned but data not available)
In total, 42 combinations (sequencer + capture with 50x).
Available for direct download from:
- https://console.cloud.google.com/storage/browser/brain-genomics-public/research/sequencing/fastq?pageState=(%22StorageObjectListTable%22:(%22f%22:%22%255B%255D%22))&prefix=&forceOnObjectsSortingFiltering=false
- via the AWS CLI: aws s3 ls --no-sign-request s3://genomics-benchmark-datasets/google-brain/fastq/
Kit availability:
- Agilent: on their website (login required)
- IDT: freely accessible but needs cleaning: https://sfvideo.blob.core.windows.net/sitefinity/docs/default-source/supplementary-product-info/xgen-exome-hyb-panel-v2-targets-manifest-hg38.txt?sfvrsn=b6c0e207_2
awk '{print $2"\t"$3"\t"$4}' xgen-exome-hyb-panel-v2-targets-manifest-hg38.txt | sed '/^^chr/d' > xgen-exome-hyb-panel-v2-targets-manifest-hg38.bed
- TruSeq:
https://emea.support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/truseq/truseq-dna-exome/truseq-dna-exome-targeted-regions-manifest-v1-2-bed.zip
#### Choice of coverage
Tests redone because performance was worse on NovaSeq than on HiSeq 4000...
We compute (see the coverage scripts in exomevalidator) the coverage of the BAMs provided by Baid 2020 for gatk4. For Agilent and TruSeq (the kits provided), this is an improvement over the BEDs found online (except for the mean and median for Agilent). Same for bisonex -> phew!
Test results:

| | >= 30x | Median | Mean |
| --- | --- | --- | --- |
| Agilentv7 at 50x | 82.0% | 63x | 73.8 |
| Cento | 96% | 62-100% | |
| IDT 75x | 86% | 64x | 76.6 |
Notes:
- Caution when using mosdepth: do not look at the global stats, only the per-region stats, when restricting to a BED (see the grep sketch below)! With the BED in bisonex, multiqc + mosdepth gives
- Example command:
mosdepth --by agilent-GRCh38.bed --fasta GCA_000001405.15_GRCh38_full_analysis_set.fna HG001-HiSeq4000-Agilentv7-GRCh38-v2 HG001-HiSeq4000-Agilentv7-GRCh38.markedduplicates.bam
nextflow run main.nf -profile standard,helios --input=hg001-hiseq400-idt-75x.csv --genome=GRCh38 -with-trace -with-report --capture=capture/xgen-exome-hyb-panel-v2-targets-manifest-hg38.bed -bg
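To read the per-region numbers rather than the global ones, one can look at the `*_region` rows of the mosdepth summary; a small sketch, assuming the output prefix used above:
```bash
# The plain rows cover the whole genome; the *_region rows are restricted to the BED.
grep -E '^chrom|_region' HG001-HiSeq4000-Agilentv7-GRCh38-v2.mosdepth.summary.txt
```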
### Also on NIST?
It looks suspiciously similar: provided by Google, same sequencer and kit. However, the size does not match exactly (50x at 1.4G instead of 3?).
https://trace.ncbi.nlm.nih.gov/Traces/?view=study&acc=SRP322567
Filtering on exome:
https://trace.ncbi.nlm.nih.gov/Traces/study/?acc=SRP322567&f=assay_type_s%3An%3Awxs%3Ac&o=acc_s%3Aa
The data can be downloaded from "metadata" in CSV format.
## Data from @barbitoff2022
HG001 to 7 with HiSeq 4000 (except one). Everything with Agilent SureSelect v5 or v7.
BED file available in hg19 on the GitHub repo, which contains all the scripts.
In the end, there is more data with Baid 2020...
## Converting the BAM data to fastq
bcbio converts the .bed:
https://github.com/bcbio/bcbio_validation_workflows/blob/master/giab-exome/input/get_data.sh
# Results
## NA12878
Command
```bash
hap.py giab/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz ~/code/bisonex/out/call_variant/haplotypecaller/2300346867_63118093-NA12878-GRCh38/2300346867_63118093-NA12878-GRCh38.haplotypecaller.vcf --false-positive giab/HG001_GRCh38_1_22_v4.2.1_benchmark.bed --target-regions ~/code/bisonex/capture/Twist_Exome_Core_Covered_Targets_hg38.bed --reference genome/GCA_000001405.15_GRCh38_full_analysis_set.fna --engine=vcfeval --engine-vcfeval-template=genome/sdf -o na12878-bisonex-grch38
```
Result: ../../../code/data/na12878-bisonex-grch38
## Baid 2020
hap.py with the rtg vcfeval engine.
For HG001 to HG007, the results are generated with exomevalidator (compare-exome produces a .summary.csv for all the runs).
Example for HG001 HiSeq 4000 Agilent SureSelect v7 (BED in GRCh38):
Type,Filter,TRUTH.TOTAL,TRUTH.TP,TRUTH.FN,QUERY.TOTAL,QUERY.FP,QUERY.UNK,FP.gt,FP.al,METRIC.Recall,METRIC.Precision,METRIC.Frac_NA,METRIC.F1_Score,TRUTH.TOTAL.TiTv_ratio,QUERY.TOTAL.TiTv_ratio,TRUTH.TOTAL.het_hom_ratio,QUERY.TOTAL.het_hom_ratio,run
INDEL,ALL,549,489,60,899,62,340,17,9,0.89071,0.889088,0.378198,0.889898,,,1.86096256684492,2.271062271062271,HG001-HiSeq4000-Agilentv7-50x-GRCh38.haplotypecaller.summary
INDEL,PASS,549,489,60,899,62,340,17,9,0.89071,0.889088,0.378198,0.889898,,,1.86096256684492,2.271062271062271,HG001-HiSeq4000-Agilentv7-50x-GRCh38.haplotypecaller.summary
SNP,ALL,21973,21466,507,26288,562,4266,69,72,0.976926,0.97448,0.162279,0.975702,3.007110300820419,2.7840287769784173,1.5918102430965306,1.8152593227603944,HG001-HiSeq4000-Agilentv7-50x-GRCh38.haplotypecaller.summary
## Comparison with Baid 2020
(figure)
We use their VCFs. With plot/giab.pl, we compare against bisonex.
The results are quite surprising, since the distribution is different, even for bwa-mem + gatk.
Two differences:
1. The no_alt version is used: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
2. haplotypecaller uses -L to restrict to the capture region.
Probably related to the reference genome, but we perform better in any case.
With the .bed provided by Baid 2020 (Agilent + IDT), the results differ when analysed by capture or by sequencer. But NovaSeq is not worse.
Study of the impact of coverage: it does not seem to improve anything (see plots/coverage.jl).
Tested on mean, median, >= 30x.
# Varben
- Install bedtools, samtools and bwa
- The code had to be converted to Python 3 (with 2to3)
- Validated on chr20 and on a provided test (see varben-journal.md)
**Important**
- the hard-clipped reads must be removed from the BAM (see varben-journal.md)
- for the BAM -> fastq conversion, the BAM must be sorted first (otherwise too many reads are lost)
# Sanger variants on NA12878 bisonex
## Creating the BAM
We use the NA12878 data. Samplesheet `na12878.csv`:
```
patient,sample,fastq1,fastq2
2300346867_63118093,NA12878,/Work/Groups/bisonex/centogene/fastq/2300346867_63118093_NA12878/63118093_S260_R1_001.fastq.gz,/Work/Groups/bisonex/centogene/fastq/2300346867_63118093_NA12878/63118093_S260_R2_001.fastq.gz
```
Pipeline
```bash
nextflow run main.nf -profile standard,helios --genome=GRCh38 --input=na12878.csv -with-report na12878.report -with-trace -bg
```
The hard clips must then be removed for varben:
```bash
cd out/preprocessing/recalibrated/2300346867_63118093-NA12878-GRCh38
samtools view -h 2300346867_63118093-NA12878-GRCh38.recal.bam | awk '$6 !~ /H/{print}' | samtools view -bS - > 2300346867_63118093-NA12878-GRCh38.recal.nohardclip.bam
```
We add the genomes:
```bash
cd /Work/Users/apraga/exomevalidator
ln -s /Work/Groups/bisonex/data/fasta/GRCh38/bwa/* .
ln -s /Work/Groups/bisonex/data/fasta/GRCh38/GCA_000001405.15_GRCh38_full_analysis_set.fna .
ln -s /Work/Groups/bisonex/data/fasta/GRCh38/GCA_000001405.15_GRCh38_full_analysis_set.fna.fai
```
## Preparing the variants
We use `../code/varben/tovarben.nu` to format the variant list into the format expected by varben, `sanger_variants.tsv`.
#### Beware of duplications
Varben does not support duplications in mutation mode (only as structural variants). They must be written as insertions.
Example:
g.169821134dup corresponds to the duplication of the T at position 169821134.
The correct VCF notation is 169821133 G GT.
This corresponds to the insertion of a T between 169821133 and 169821134, i.e.
169821133 169821134 0.5 ins T
So the first nucleotide of the VCF representation must indeed be removed (see the awk sketch below)!
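Since tovarben.nu already does this conversion, the sketch below only illustrates the rule; the input format (tab-separated CHROM, POS, REF, ALT) and the file names are assumptions, not the actual script:
```bash
# Turn a single-base-anchored VCF insertion (e.g. "chr1  169821133  G  GT") into a
# VarBen 'ins' row: chrom, pos, pos+1, AF, 'ins', ALT minus its first base.
awk -v OFS='\t' -v af=0.5 'length($4) > length($3) && length($3) == 1 {
  print $1, $2, $2 + 1, af, "ins", substr($4, 2)
}' dups_as_vcf.tsv > varben_ins.tsv
```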
## Inserting the variants
With:
```slurm
#!/bin/bash -l
# File: submission.SBATCH
#SBATCH --job-name="varben"
#SBATCH --output=%x.%J.out ## %x=job name, %J=job id
#SBATCH --error=%x.%J.out
# walltime (hh:mm::ss) max is 8 days
#SBATCH -t 4:00:00
#SBATCH --partition=smp
#SBATCH -c 1 ## request 1 core (MAX is 32)
#SBATCH --mem=16G ## (MAX is 96G)
#SBATCH --mail-user=apraga@chu-besancon.fr
#SBATCH --mail-type=END,FAIL # notify when job end/fail
module load nix
muteditor -r ../GCA_000001405.15_GRCh38_full_analysis_set.fna --aligner bwa --alignerIndex ../GCA_000001405.15_GRCh38_full_analysis_set.fna -o na12878-sanger -m ../sanger_varben.tsv --mindepth 5 -b ../../bisonex/out/preprocessing/recalibrated/2300346867_63118093-NA12878-GRCh38/2300346867_63118093-NA12878-GRCh38.recal.nohardclip.bam
```
In exomevalidator/scripts:
```bash
sbatch varben.slurm
```
59 min elapsed, 3h50 of CPU time (but is it really parallelised?).
Only one variant was not inserted because of 0 reads: SNV chr21 43426167.
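To double-check that position (a sketch; the BAM is the no-hardclip BAM passed to varben above, which needs an index for region queries):
```bash
samtools index 2300346867_63118093-NA12878-GRCh38.recal.nohardclip.bam
# Count the reads overlapping the position varben skipped.
samtools view -c 2300346867_63118093-NA12878-GRCh38.recal.nohardclip.bam chr21:43426167-43426167
```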
## Converting to fastq
We initially used bam2fastq, but after checking with Alexis + the internet, we switched to samtools fastq.
```
cd na12878-sanger/
cp edit.sorted.bam NA12878-sanger-GRCh38.bam
```
We use bam2fastq2.slurm with 4 threads (~12 min instead of 50 sequentially):
```slurm
#!/bin/bash -l
# File: submission.SBATCH
#SBATCH --job-name="bam2fastq"
#SBATCH --output=%x.%J.out ## %x=job name, %J=job id
#SBATCH --error=%x.%J.out
# walltime (hh:mm::ss) max is 8 days
#SBATCH -t 4:00:00
#SBATCH --partition=smp
#SBATCH -c 4 ## request 4 cores (MAX is 32)
#SBATCH --mem=24G ## (MAX is 96G)
#SBATCH --mail-user=apraga@chu-besancon.fr
#SBATCH --mail-type=END,FAIL # notify when job end/fail
dir=na12878-sanger
prefix=NA12878-sanger-GRCh38
bam=$dir/$prefix.bam
sorted=$dir/${prefix}-sorted.bam
cores=4
samtools sort -n $bam -o $sorted -@ $cores
samtools fastq -1 $dir/${prefix}_R1_001.fastq -2 $dir/${prefix}_R2_001.fastq -0 /dev/null -s /dev/null -n -@ $cores $sorted
```
## Pipeline
With the samplesheet:
```sanger-varben.csv
patient,sample,fastq1,fastq2
NA12878,sanger-varben,/Work/Users/apraga/exomevalidator/scripts/na12878-sanger/NA12878-sanger-GRCh38_R1_001.fastq,/Work/Users/apraga/exomevalidator/scripts/na12878-sanger/NA12878-sanger-GRCh38_R2_001.fastq
```
We use a slurm script to launch nextflow on helios (see helios.md):
```slurm
#!/bin/bash -l
# File: submission.SBATCH
#SBATCH --job-name="bisonex-varben"
#SBATCH --output=%x.%J.out ## %x=job name, %J=job id
#SBATCH --error=%x.%J.out
# walltime (hh:mm::ss) max is 8 days
#SBATCH -t 24:00:00
#SBATCH --partition=smp
#SBATCH -c 1 ## request 1 core (MAX is 32)
#SBATCH --mem=12G ## (MAX is 96G)
#SBATCH --mail-user=apraga@chu-besancon.fr
#SBATCH --mail-type=END,FAIL # notify when job end/fail
module load nix/2.11.0
# Otherwise job fails as it cannot write to $HOME/.nextflow
export NXF_HOME=/Work/Users/apraga/.nextflow
# user.name must be forced (again... our fix does not seem to work in nix)
nextflow -Duser.name=apraga run main.nf -profile standard,helios --input=sanger-varben.csv --genome=GRCh38 -resume 8749b270-ca2c-4f72-82d9-69fff6563c56
```
# Comparison
For the Sanger variants, a VCF is generated in order to use rtg vcfeval. See the [nu script](../code/varben/tovcf.nu).
vcfeval requires a GT field, which the script adds according to the zygosity (`--sample ALT`).
## Test on chr1:
| Threshold | True-pos-baseline | True-pos-call | False-pos | False-neg | Precision | Sensitivity | F-measure |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 94.000 | 4 | 4 | 9508 | 7 | 0.0004 | 0.3636 | 0.0008 |
| None | 10 | 10 | 482190 | 1 | 0.0000 | 0.9091 | 0.0000 |
The false negative is homozygous but was wrongly called heterozygous. It is mostly due to a low number of reads: 2/12 carry the variant.
## All the variants
### Variant calling
```bash
rm -rf sanger-varben
rtg vcfeval -b ~/research/bisonex/code/varben/sanger.vcf.gz -c ~/code/bisonex/out/call_variant/haplotypecaller/NA12878-sanger-varben-GRCh38/NA12878-sanger-varben-GRCh38.haplotypecaller.vcf.gz -t genome/sdf -o sanger-varben --output-mode=annotate
```
- 1 false negative which was indeed not inserted by varben for lack of reads: `zgrep FN sanger-varben/baseline.vcf.gz | zgrep -v FN_CA`
- 13 false negatives but on zygosity (even though they were correctly inserted): `zgrep FN sanger-varben/baseline.vcf.gz | zgrep FN_CA`
### Filtering after variant calling
Depth filter: one additional deletion is filtered out because there are only 21 reads at chr6 72622255.
```nu
let f = "../bisonex/out/call_variant/filter/NA12878-sanger-varben-GRCh38/NA12878-sanger-varben-GRCh38.filter.depth.vcf"
bgzip $f ; tabix -p vcf $"($f).gz" ; rtg vcfeval -b ~/research/bisonex/code/varben/sanger.vcf.gz -c $"($f).gz" -t genome/sdf -o sanger-varben-filter-depth --output-mode=annotate
```
SNP filter: same result.
```nu
let f = "../bisonex/out/call_variant/filter/NA12878-sanger-varben-GRCh38/NA12878-sanger-varben-GRCh38.filter.polymormphisms.vcf"
bgzip $f ; tabix -p vcf $"($f).gz" ; rtg vcfeval -b ~/research/bisonex/code/varben/sanger.vcf.gz -c $"($f).gz" -t genome/sdf -o sanger-varben-filter-snp --output-mode=annotate
```
### Filtering after annotation
Same result!
```nu
let f = "../bisonex/out/annotate/filter/NA12878-sanger-varben-GRCh38/NA12878-sanger-varben-GRCh38.filtervep.vcf"
bgzip $f ; tabix -p vcf $"($f).gz" ; rtg vcfeval -b ~/research/bisonex/code/varben/sanger.vcf.gz -c $"($f).gz" -t genome/sdf -o sanger-varben-filter-vep --output-mode=annotate
```
## TODO
- launch the run from the CLI in Rust
- download the genome index (.fnai + bwa)
# Test in silico: journal
## Varben
### Provided tests:
cd examples
xz -d reference/chr*.fna.xz
cd src
For Illumina:
bash 03-spikein_snv_and_indel_Illumina.sh
### Clinvar data?
On this test case, deletion, insertion and SNV are OK. Substitution of a whole sequence ("sub") works too.
chr7 55221748 55221748 0.15 del .
chr7 55227923 55227923 0.33 ins TGTTCC
chr7 55242467 55242481 0.03 sub TTC
chr7 55249038 55249038 0.03 snv A
chr7 55249092 55249092 0.05 snv C
chr7 55259515 55259515 0.03 snv G
### chr20 on HG001
After packaging it with nix, the data must be formatted:
```nu
cd ~/research/bisonex/code/varben
let df = open ../parsevariants/variants_centogene_final.csv | where "Confirmed in sanger" == "true" and "genomic (hg38)" != "" | insert AF {|e| if ($e.zygosity == "heterozygous") { random float 0.4..0.5 } else { random float 0.8..1 } } | select "genomic (hg38)" chrom pos AF alt | rename genomic
$df | insert type {|e| if ($e.genomic | str contains ">") { "SNV" } else if ($e.genomic | str contains "del") { "del" } else if ($e.genomic | str contains "ins") { "ins"} else if ($e.genomic | str contains "dup") { "dup"} else { $e.genomic }} | insert chrom2 {|e| $"chr($e.chrom)" } | insert end {|e| $e.pos} | select chrom2 pos end AF type alt | sort-by chrom2 pos end | to csv -s '\t' | save sanger_varben.tsv -f
```
And download the genome index + bwa index.
With:
```bash
cd ~/code/exomevalidator
rg chr20 ~/research/bisonex/code/varben/sanger_varben.tsv | save sanger_varben_chr20.tsv -f
```
Then:
```bash
./result/bin/muteditor -r GCA_000001405.15_GRCh38_full_analysis_set.fna -b ~/code/bisonex/out/preprocessing/recalibrated/HG001-HiSeq4000-Agilentv7-50x-GRCh38/HG001-HiSeq4000-Agilentv7-50x-GRCh38.recal-chr20.bam --aligner bwa --alignerIndex GCA_000001405.15_GRCh38_full_analysis_set.fna -o chr20 -m sanger_varben_chr20.tsv
```
Note that the run sometimes crashes (only with our nix-packaged version?):
samtools sort -n -o chr20/tempDir/edit.sortByName.bam chr20/tempDir/edit.bam
[E::bam_read1] CIGAR and query sequence lengths differ for K00141:389:HGNTWBBXX:3:1209:1479:7855
samtools sort: truncated file. Aborting
OK when rerunning...
Then conversion to fastq (apparently already sorted):
```bash
cd chr20
samtools bam2fq -n -@ 4 -1 chr20-snv_1.fq.gz -2 chr20-snv_2.fq.gz -0 chr20-snv_other.fq.gz -s chr20-snv_singleton.fq.gz edit.sorted.bam
```
We test the variant calling:
rsync -avz chr20-snv_1.fq.gz chr20-snv_2.fq.gz meso:/Work/Users/apraga/bisonex/test/varben/
On meso, create varben.csv:
patient,sample,fastq1,fastq2
varben-sanger,chr20,test/varben/chr20-snv_1.fq.gz,test/varben/chr20-snv_2.fq.gz
nextflow run main.nf -profile standard,helios --genome=GRCh38 --input=varben.csv
We do find it after variant calling :)
bcftools view work/ff/35fa5353da6da53e6914c1d67f7a79/varben-sanger-chr20-GRCh38.haplotypecaller.vcf.gz chr20:46043645-46043645
But it is filtered out on depth, because there are 25 reads and the threshold is 30 :(
Possible because we are using the "low depth" 50x data from Baid 2020.
NB: spip crashes on empty input.
Insertion and deletion tests OK. Duplication test: not done.
#### Error with hard clips
Error:
```bash
Exception: Cmd Error: samtools sort -n -o na12878-sanger/tempDir/edit.sortByName.bam na12878-sanger/tempDir/edit.bam
```
According to https://github.com/nccl-jmli/VarBen/issues/18, the hard-clipped reads must be removed:
```bash
cd ../../bisonex/out/preprocessing/recalibrated/2300346867_NA12878-63118093_S260-GRCh38/
samtools view -b -F 0x100 2300346867_63118093-NA12878-GRCh38.recal.bam > 2300346867_63118093-NA12878-GRCh38.recal.nohardclip.bam
```
But same error... We try the second suggestion:
```bash
samtools view -h 2300346867_63118093-NA12878-GRCh38.recal.bam | awk '$6 !~ /H/{print}' | samtools view -bS - > 2300346867_63118093-NA12878-GRCh38.recal.nohardclip.bam
```
OK!
### Dups inserted at the wrong position
3rd version: the bounds are always pos and pos+1, and the first nucleotide of ALT is removed.
Verification of the 4 dups.
# Synthetic data
# Tests _in silico_
## Inserting variants into a patient
- ExomeValidator.jl: our Julia version. OK for SNVs, tested on Sanger-confirmed variants
- [Varben](varben.md)
## Simulated data
- [Simuscop](simuscop): we use SimuSCoP because it easily handles the exome (otherwise the data has to be filtered after whole-genome generation) and it is fast.
# Simuscop
## NA12878
### Generating the profile
Tested in exomevalidator.
An uncompressed VCF is needed, then generate the model:
```bash
cd baid2020/grch38/vcf/hiseq4000/wes_agilent/50x/ && gunzip HG001.hiseq4000.wes-agilent.50x.gatk4.grch38.vcf
./result/bin/seqToProfile -b baid2020/grch38/bam/hiseq4000/wes_agilent/50x/HG001.hiseq4000.wes-agilent.50x.dedup.grch38.bam -t baid2020/bed/ -v baid2020/grch38/vcf/hiseq4000/wes_agilent/50x/HG001.hiseq4000.wes-agilent.50x.gatk4.grch38.vcf -r GCA_000001405.15_GRCh38_full_analysis_set.fna.gz | save hg001-hiseq4000-agilent-grch38.model
```
With bisonex:
```bash
gunzip ../bisonex/out/call_variant/haplotypecaller/2300346867_63118093-NA12878-GRCh38/2300346867_63118093-NA12878-GRCh38.haplotypecaller.vcf.gz
```
Profile generation (7 min sequentially):
```slurm
#!/bin/bash -l
# File: submission.SBATCH
#SBATCH --job-name="seqToProfile"
#SBATCH --output=%x.%J.out ## %x=job name, %J=job id
#SBATCH --error=%x.%J.out
# walltime (hh:mm::ss) max is 8 days
#SBATCH -t 4:00:00
#SBATCH --partition=smp
#SBATCH -c 1 ## request 1 core (MAX is 32)
#SBATCH --mem=16G ## (MAX is 96G)
#SBATCH --mail-user=apraga@chu-besancon.fr
#SBATCH --mail-type=END,FAIL # notify when job end/fail
module load nix
./result/bin/seqToProfile -b ../bisonex/out/preprocessing/recalibrated/2300346867_63118093-NA12878-GRCh38/2300346867_63118093-NA12878-GRCh38.recal.bam -t ../bisonex/capture/Twist_Exome_Core_Covered_Targets_hg38.bed -v ../bisonex/out/call_variant/haplotypecaller/2300346867_63118093-NA12878-GRCh38/2300346867_63118093-NA12878-GRCh38.haplotypecaller.vcf -r genome/GCA_000001405.15_GRCh38_full_analysis_set.fna > na12878-bisonex-grch38.model
```
## Configuration files
The file defining the variants must be tab-separated.
### Sanger variants on HG001
Configuration file: 150 bp reads and a depth of 100 (200 would give 11G per fastq whereas our target is 3).
```
ref = ../genome/GCA_000001405.15_GRCh38_full_analysis_set.fna
profile = na12878-bisonex-grch38.model
variation = ./sanger_simuscop.txt
target = ../../bisonex/capture/Twist_Exome_Core_Covered_Targets_hg38.bed
name = bisonex
output = na12878-simuscop
layout = PE
threads = 4
verbose = 1
coverage = 100
# According to the bam
insertSize = 150
```
The tab-separated variant file is generated by `~/research/bisonex/code/simuscop/tosimuscop.nu`.
For the variations:
- the position of a deletion is exactly the start of the deletion, and the length is the number of bases removed (as expected)
- an insertion goes between the given position and the next one (also as expected); see the sketch below
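To make the layout concrete, here is a minimal, invented example of such a file, following the column order used by the ClinVar export further down this page (type letter, population, chromosome, position, value(s), zygosity); the authoritative format is whatever `tosimuscop.nu` actually produces:
```bash
# Columns: type, population, chrom, pos, value(s), zygosity (tab-separated).
{
  printf 's\tbisonex\tchr1\t1000000\tA\tG\thet\n'  # SNV: ref then alt
  printf 'i\tbisonex\tchr2\t2000000\tTTG\thet\n'   # insertion: inserted sequence
  printf 'd\tbisonex\tchr3\t3000000\t4\thet\n'     # deletion: length in bp
} > variants_example.txt
```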
### Insertion
```slurm
#!/bin/bash -l
# Fichier submission.SBATCH
#SBATCH --job-name="simuscop"
#SBATCH --output=%x.%J.out ## %x=job name, %J=job id
#SBATCH --error=%x.%J.out
# walltime (hh:mm::ss) max is 8 days
#SBATCH -t 4:00:00
#SBATCH --partition=smp
#SBATCH -c 4 ## request 4 cores (MAX is 32)
#SBATCH --mem=16G ## (MAX is 96G)
#SBATCH --mail-user=apraga@chu-besancon.fr
#SBATCH --mail-type=END,FAIL # notify when job end/fail
module load nix
simuReads sanger-simuscop.config
```
## Comparing Sanger variants on NA12878
Variant calling
11 false negatives:
- 4 "true" ones, all SNVs
  - chr10 71741823: not present in the BED?
  - chr17 7906605: 3 reads out of 34 in the BAM, so it does not make sense that it is not detected
  - chr19 42273954: not present in the BED?
  - chr21 43426167: should be detected.
- 7 on zygosity: all SNVs, and all but one have 100% of their reads carrying the alternate allele... A limitation of this approach with HaplotypeCaller? (see the query sketch after this list)
  - chr1 39388062
  - chr3 9436861
  - chr9 137452819
  - chr11 77337387
  - chr12 51786689
  - chr19 5077400
  - chr19 13506173
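To double-check those zygosity calls, the genotype and allele depths at these positions can be queried directly (sketch; the VCF path is the one used further down this page and needs a tabix index):
```bash
# Inspect GT and AD at the zygosity false negatives (adjust the VCF path).
vcf=NA12878-sanger-simuscop-GRCh38.haplotypecaller.vcf.gz
bcftools query -f '%CHROM %POS %REF %ALT [ %GT %AD ]\n' \
  -r chr1:39388062,chr3:9436861,chr9:137452819,chr11:77337387,chr12:51786689,chr19:5077400,chr19:13506173 \
  "$vcf"
```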
IGV review
```bash
samtools view NA12878-sanger-simuscop-GRCh38.recal.bam chr10:71741823-71741823 chr17:7906605-7906605 chr19:42273954-42273954 chr21:43426167-43426167 -b > simuscop-missed.bam
samtools view 2300346867_63118093-NA12878-GRCh38.recal.bam chr10:71741823-71741823 chr17:7906605-7906605 chr19:42273954-42273954 chr21:43426167-43426167 -b > simuscop-missed-ref.bam
```
Region not covered. We test the following capture files:
- hg38_exome_comp_spikein_v2.0.2_targets_sorted.re_annotated.bed
- hg38_exome_v2.0.2_targets_sorted_validated.re_annotated.bed
- Twist_Comprehensive_Exome_Covered_Targets_hg38.bed
- Twist_Exome_Core_Covered_Targets_hg38.bed
- Twist_Exome_RefSeq_targets_hg38.bed
The Exome Core Covered BED (the one we use) does not contain them!! The others do. Missing:
- exon 2 of /CIC/ (chr19)
- exon 38 of CDH38 (not visible in IGV with the RefSeq track), a gene not known in RefSeq
We test the BAM coverage for each BED (see [configuration](validation/configuration.md)): clearly not the best.
We stick with it, since we do not have Cento's specification.
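A quick way to check whether one of the positions above falls inside a given capture BED (sketch; BED intervals are 0-based half-open, hence pos-1 for the start):
```bash
# Does chr19:42273954 overlap the capture kit? Prints the interval if it does.
printf 'chr19\t42273953\t42273954\n' \
  | bedtools intersect -a stdin -b Twist_Exome_Core_Covered_Targets_hg38.bed -u
```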
## Notes
The depth seems to be the maximum depth, reached at the center of the interval.
## Clinvar
We use the following script (also in exomevalidator/clinvar/clinvar.nu for now)
```nu
wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz
# Add the header
zgrep '^#' ../../data/clinvar.vcf.gz | save clinvar_twist_exome.vcf
# Strip the "chr" prefix
sed 's/^chr//' ../../../bisonex/capture/Twist_Exome_Core_Covered_Targets_hg38.bed | save Twist_Exome_Core_Covered_Targets_hg38_nochr.bed
# Intersection
bedtools intersect -a ../../data/clinvar.vcf.gz -b Twist_Exome_Core_Covered_Targets_hg38_nochr.bed | save -a clinvar_twist_exome.vcf
bgzip clinvar_twist_exome.vcf
tabix -p vcf clinvar_twist_exome.vcf.gz
# Only class 4 or 5
bcftools view -i 'INFO/CLNSIG == "Pathogenic" || INFO/CLNSIG == "Likely_pathogenic"' clinvar_twist_exome.vcf.gz -o clinvar_twist_exome.vcf_patho.gz
# Extract with nu, dropping large rearrangements (> 10 bp)
bcftools query -f '%CHROM;%POS;%REF;%ALT;%INFO/CLNSIG;%INFO/CLNVC\n' clinvar_twist_exome.vcf_patho.gz | from csv -s ';' -n | where ($it.column3 | str length) < 10 and ($it.column4 | str length) < 10 | to csv -s '\t' | save clinvar_twist_patho_small.tsv
# Keep variants at least 10 bp apart
awk 'BEGIN{prev=0}{ if ($2-prev>10) { print $0; prev=$2}}' clinvar_twist_patho_small.tsv | save clinvar_twist_patho_sampled.tsv
# Extract duplications, insertions, deletions and SNVs
let df = open "clinvar_twist_patho_sampled.tsv" --raw | from tsv -n | update column6 {|e| if ($e.column6 == "Deletion") {"d"} else if ($e.column6 == "Insertion" or $e.column6 == "Duplication") { "i" } else if ($e.column6 == "single_nucleotide_variant") {"s"}} | where not ($it.column6 | is-empty)
let df2 = $df | update column1 {|e| $"chr($e.column1)"} | insert pop "bisonex" | insert zyg "het"
$df2 | where column6 == "i" | update column4 {|e| $e.column4 | str substring 1.. } | select column6 pop column1 column2 column4 zyg | to tsv -n | save clinvar-simuscop.tsv
$df2 | where column6 == "d" | update column2 {|e| $e.column2 + 1} | insert length {|e| ($e.column3 | str length) - ($e.column4 | str length)} | select column6 pop column1 column2 length zyg | to tsv -n | save -a clinvar-simuscop.tsv
$df2 | where column6 == "s" | select column6 pop column1 column2 column3 column4 zyg | to tsv -n | save -a clinvar-simuscop.tsv
```
We check afterwards that we indeed kept only the pathogenic/likely pathogenic variants
```nu
❯ bcftools query -f '%INFO/CLNSIG' clinvar_twist_exome.vcf_patho.gz | lines | sort | uniq -c
╭───┬───────────────────────────────────┬────────╮
│ # │ value │ count │
├───┼───────────────────────────────────┼────────┤
│ 0 │ Likely_pathogenic │ 58885 │
│ 1 │ Likely_pathogenic,_low_penetrance │ 6 │
│ 2 │ Pathogenic │ 143575 │
```
To compare the variants, we need a VCF without the large indels. The script above wrote it in CSV format, so:
```nu
zgrep '^#' clinvar_twist_exome.vcf_patho.gz | save clinvar_ref.vcf -f
zgrep -v '^#' clinvar_twist_exome.vcf_patho.gz | from tsv -n | where ($it.column4 | str length) < 10 and ($it.column5 | str length) < 10 | to tsv -n | save clinvar_ref.vcf -a
```
And "chr" must be added back for the comparison:
```bash
sed '/^#/! s/^/chr/' clinvar_ref.vcf -i.bak
bgzip clinvar_ref.vcf
tabix -p vcf clinvar_ref.vcf.gz
```
The GT column is missing, so the sample is disabled:
```bash
rtg vcfeval -b scripts/clinvar/clinvar_ref.vcf.gz -c ~/code/bisonex/out/call_variant/haplotypecaller/NA12878-clinvar-simuscop-GRCh38/NA12878-clinvar-simuscop-GRCh38.haplotypecaller.vcf.gz -t genome/sdf -o clinvar-simuscop --output-mode=annotate --sample ALT
```
Result: a great many warnings that some regions were too complex to be evaluated
```
Threshold True-pos-baseline True-pos-call False-pos False-neg Precision Sensitivity F-measure
----------------------------------------------------------------------------------------------------
None 141609 132069 1306 44081 0.9902 0.7626 0.8616
```
Another test:
```bash
bedtools intersect -a clinvar_ref.vcf.gz -b ~/code/bisonex/out/call_variant/haplotypecaller/NA12878-clinvar-simuscop-GRCh38/NA12878-clinvar-simuscop-GRCh38.haplotypecaller.vcf.gz -v | wc -l
```
gives 42140, out of
```bash
zgrep -v '^#' clinvar_ref.vcf.gz | wc -l
```
which gives 189038, i.e. about 22% of the ClinVar reference variants have no overlapping call (about 78% are recovered).
# Coverage test
```test.config
## [required] fasta reference file from which reads will be sampled
ref = ../genome/GCA_000001405.15_GRCh38_full_analysis_set.fna
## [required] a model file generated by "seqToProfile" utility
profile = na12878-bisonex-grch38.model
## [optional] files defining the variations (indel, SNV and CNV) to simulate
variation = ./variations.txt
## [optional] file defining target regions for sequencing. If the argument is not specified, sequencing data of all chromosomes defined in the reference will be produced.
target = ./test.bed
## [required] population names (comma-separated).
name = test
## [required] output directory
output = ./simuscop
## [optional] sequence layout, should be "SE" for single end or "PE" for paired-end (default: "SE")
layout = PE
## [optional] the number of threads to use (default: 1)
threads = 2
## [optional] print the intermediate information, should be 0 or 1 (default: 1)
verbose = 1
## [required] sequencing coverage
coverage = 100
## [optional] insert size for paired end sequencing (default: 350, only effective when the "layout" is set to "PE")
insertSize = 200
```
```bash
../result/bin/simuRead test.config
bwa mem ../GCA_000001405.15_GRCh38_full_analysis_set.fna simuscop/test_1.fq simuscop/test_2.fq | samtools sort -o test.sorted.bam -
```
On the first interval of the Twist exome core BED, simuscop produces a "Gaussian" over the exon. The Cento data show a dip in the middle of the exon, between two peaks...
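To look at the per-base shape of the simulated coverage over one target (sketch; test.bed and test.sorted.bam are the files produced just above):
```bash
# Per-base depth over the first interval of test.bed (BED starts are 0-based, hence +1).
samtools index test.sorted.bam
region=$(awk 'NR==1{printf "%s:%d-%d", $1, $2+1, $3}' test.bed)
samtools depth -a -r "$region" test.sorted.bam | awk '{print $2, $3}'
```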
## Variant comparison: Sanger on NA12878
```nu
["filter", "haplotypecaller"] | each {|e| rsync -avz $"meso:/Work/Users/apraga/bisonex/run-simuscop/out/call_variant/($e)/NA12878-sanger-simuscop-GRCh38" $"out/call_variant/($e)/" }
rtg vcfeval -b ~/research/bisonex/code/varben/sanger.vcf.gz -c ~/code/bisonex/out/call_variant/haplotypecaller/NA12878-sanger-simuscop-GRCh38/NA12878-sanger-simuscop-GRCh38.haplotypecaller.vcf.gz -t genome/sdf -o sanger-simuscop --output-mode=annotate
```
# Synthetic patient
# Configuration
## Choice of the capture kit
In the reports: "Twist Human Core Exome Plus"
We use the online version https://www.twistbioscience.com/resources/data-files/ngs-human-core-exome-panel-bed-file, which is not the "Plus" one
The problem is that it does not cover a variant that was reported and confirmed by Sanger... Several possibilities:
- the Plus version is not available online but does contain it -> possible
- the kit was updated (2021?) -> possible
- it is an hg38-related problem -> no, same in hg19 (and the BED looks lifted over from hg38)
Comparing with the BAM, there are clearly missing regions and it is not the best kit... In terms of coverage, it is.
### Quality check
#### Coverage
Source: https://www.twistbioscience.com/resources/technical-resources?twist-pick=-1&products=-1&sub-products=-1&application=411&year=-1
- hg38_exome_comp_spikein_v2.0.2_targets_sorted.re_annotated.bed
- hg38_exome_v2.0.2_targets_sorted_validated.re_annotated.bed
- Twist_Comprehensive_Exome_Covered_Targets_hg38.bed
- Twist_Exome_Core_Covered_Targets_hg38.bed
- Twist_Exome_RefSeq_targets_hg38.bed
```slurm
#!/bin/bash -l
# Fichier submission.SBATCH
#SBATCH --job-name="mosdepth"
#SBATCH --output=%x.%J.out ## %x=job name, %J=job id
#SBATCH --error=%x.%J.out
# walltime (hh:mm::ss) max is 8 days
#SBATCH -t 4:00:00
#SBATCH --partition=smp
#SBATCH -c 4 ## request 4 cores (MAX is 32)
#SBATCH --mem=12G ## (MAX is 96G)
#SBATCH --mail-user=apraga@chu-besancon.fr
#SBATCH --mail-type=END,FAIL # notify when job end/fail
module load nix/2.11.0
# JULIA_HISTORY=.julia julia mosdepth.jl
bam=out/preprocessing/recalibrated/2300346867_63118093-NA12878-GRCh38/2300346867_63118093-NA12878-GRCh38.recal.bam
mkdir test-capture
mosdepth -b capture/hg38_exome_comp_spikein_v2.0.2_targets_sorted.re_annotated.bed test-capture/na12878-hg38_exome_comp_spikein_v2.0 $bam
mosdepth -b capture/hg38_exome_v2.0.2_targets_sorted_validated.re_annotated.bed test-capture/na12878-hg38_exome_v2.0.2 $bam
mosdepth -b capture/Twist_Comprehensive_Exome_Covered_Targets_hg38.bed test-capture/na12878-Twist_Comprehensive_Exome $bam
mosdepth -b capture/Twist_Exome_Core_Covered_Targets_hg38.bed test-capture/na12878-Twist_Exome_Core $bam
mosdepth -b capture/Twist_Exome_RefSeq_targets_hg38.bed test-capture/na12878-Twist_Exome $bam
```
Result:

| Sample Name | ≥ 30X | Median | Mean Cov. |
| --- | --- | --- | --- |
| na12878-Twist_Comprehensive_Exome | 96.0% | 66.0X | 69.4X |
| na12878-Twist_Exome | 96.0% | 66.0X | 69.3X |
| na12878-Twist_Exome_Core | 97.0% | 66.0X | 69.4X |
| na12878-hg38_exome_comp_spikein_v2.0 | 94.0% | 65.0X | 68.2X |
| na12878-hg38_exome_v2.0.2 | 95.0% | 65.0X | 67.4X |

Twist Exome Core has the best results, but some regions that should be covered are not.
### Compared to the BAM
We use [bamtocov](https://github.com/telatin/bamtocov) to generate a capture file from the BAM. (NB: covtobam does not merge the intervals either...)
But it gives per-position coverage, whereas covtobed gives the union of positions >= threshold (even though bamtocov is more recent). We take the latter.
```bash
conda create --name bamtocov-env
conda activate bamtocov-env
conda install -y -c bioconda bamtocov
bamtocov 2300346867_63118093-NA12878-GRCh38.recal.bam | awk '$4 >= 30' > coverage_gt30.bed
```
We merge consecutive intervals
```bash
bedtools merge -i coverage_gt30.bed > coverage_gt30_merged.bed
```
We check whether some regions of our BAM are missing from the various capture kits: bedtools -v option.
Computing the number of intervals missing from each kit and the total size of those missing regions:
```nu
[Twist_Comprehensive_Exome_Covered_Targets_hg38.bed Twist_Exome_Core_Covered_Targets_hg38.bed Twist_Exome_RefSeq_targets_hg38.bed hg38_exome_comp_spikein_v2.0.2_targets_sorted.re_annotated.bed hg38_exome_v2.0.2_targets_sorted_validated.re_annotated.bed] | each {|e| print $e; bedtools intersect -v -a coverage_gt30_merged.bed -b $e | awk 'BEGIN{SUM=0}{SUM+=$3-$2}END{print SUM}' }
Twist_Comprehensive_Exome_Covered_Targets_hg38.bed
9817186
Twist_Exome_Core_Covered_Targets_hg38.bed
15407667
Twist_Exome_RefSeq_targets_hg38.bed
9715995
hg38_exome_comp_spikein_v2.0.2_targets_sorted.re_annotated.bed
9479802
hg38_exome_v2.0.2_targets_sorted_validated.re_annotated.bed
10820313
```
```nu
[Twist_Comprehensive_Exome_Covered_Targets_hg38.bed Twist_Exome_Core_Covered_Targets_hg38.bed Twist_Exome_RefSeq_targets_hg38.bed hg38_exome_comp_spikein_v2.0.2_targets_sorted.re_annotated.bed hg38_exome_v2.0.2_targets_sorted_validated.re_annotated.bed] | each {|e| print $e; bedtools intersect -v -a coverage_gt30_merged.bed -b $e | wc -l }
Twist_Comprehensive_Exome_Covered_Targets_hg38.bed
95582
Twist_Exome_Core_Covered_Targets_hg38.bed
114651
Twist_Exome_RefSeq_targets_hg38.bed
95271
hg38_exome_comp_spikein_v2.0.2_targets_sorted.re_annotated.bed
93839
hg38_exome_v2.0.2_targets_sorted_validated.re_annotated.bed
97634
```
By these measures, the best one is therefore Twist Exome 2.0 plus Comprehensive Exome Spike-in...
# Reanalysis
## How to retrieve recently added genes?
1. Alexis's article: manual review of the papers...
2. Via OMIM: you can query directly on the creation date! (No need to go through the API or to scrape, especially since they block IPs that do that...). Syntax: add "*" to get only genes
`date_created:2022/01/01-* entry:*`
# Strategies
New OMIM entries: https://omim.org/search/advanced/entry
1. either new genes (check "*"): no associated function
2. or a new association on an existing gene (check "#")
"Download as"
## Phenotypes
Since 2021/01/01
https://omim.org/search?index=entry&start=1&search=&sort=score+desc%2C+prefix_sort+desc&limit=10&prefix=%23&date_created_from=2021%2F01%2F01&date_created_to=&date_updated_from=&date_updated_to=
With helix we extract the genes
```bash
cd ~/research/bisonex/code/reanalaysis
sed 's/\t//g' ~/Downloads/OMIM-Entry-Retrieval.tsv | save phenotypes-2021-01-01.tsv
```
We only get results with a partial match (\t$GENE)
```nu
open phenotypes-2021-01-01.tsv --raw | from tsv -n | each {|e| rg $"\t($e.column1)" ~/annex/data/bisonex/annotate/full} | str join | save phenotype-result.txt
```
## AFF3
Several matches for [AFF3](https://www.omim.org/entry/619297). pLI of 1
To analyze all the variants, we add the file name to each result and gather everything into a TSV
```nu
open phenotypes-2021-01-01.tsv --raw | from tsv -n | each {|e| rg -H --no-heading $"\t($e.column1)" ~/annex/data/bisonex/annotate/full } | str join | save phenotype-result.tsv -f
```
2 candidates:
- VUS chr2:g.100104415G>A https://mobidetails.iurc.montp.inserm.fr/MD/api/variant/577218/browser/ a few bioinformatics scores in favor, the clinical picture partially fits (but is not specific). Already a VUS on another gene
- (*) VUS- chr2:g.99560401T>C NM_001386135.1:c.3155A>G https://mobidetails.iurc.montp.inserm.fr/MD/api/variant/282096/browser/ -> ClinVar VUS, high SPIP score but none of the other scores. Severe clinical picture, partial overlap (notably encephalopathy). Exome reported negative
Very unlikely:
- (*) chr2:g.99707103C>T https://mobidetails.iurc.montp.inserm.fr/MD/api/variant/577243/browser/ slight splice alteration. Clinical picture does not fit. Exome negative
- VUS chr2:g.100006849G>T NM_001386135.1:c.656C>A https://mobidetails.iurc.montp.inserm.fr/MD/api/variant/577248/browser/ -> ruled out clinically: diabetes, with a genetic cause already found...
- https://mobidetails.iurc.montp.inserm.fr/MD/api/variant/453600/browser/
- chr2:g.99601569_99601571del, CADD of 18 https://mobidetails.iurc.montp.inserm.fr/MD/api/variant/115565/browser/
- chr2:g.99593527C>T https://mobidetails.iurc.montp.inserm.fr/MD/api/variant/577240/browser/
Paul: VUS- for both
Alexis checked that they are indeed in the Centogene VCFs.
## Recent genes
858 new genes with
https://omim.org/search?index=entry&start=1&search=&sort=score+desc%2C+prefix_sort+desc&limit=10&prefix=*&date_created_from=2021%2F01%2F01&date_created_to=&date_updated_from=&date_updated_to=
To filter down to the interesting pLI values only:
1. Clean the OMIM file (header, footer) and rename it to OMIM-new-genes.tsv
Then split the gene name into a new column
```nu
sd '; ' '\t' OMIM-new-genes.tsv
```
And add the `gene` header (see the sketch below)
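One possible way to add that column name (sketch; assumes line 1 is the header row and that the split above added exactly one unnamed column; the name must match the join key used further down, i.e. `Gene`):
```bash
# GNU sed: \t is a literal tab; appends "Gene" to the header line.
sed -i '1s/$/\tGene/' OMIM-new-genes.tsv
```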
2. Download the pLI scores
```nu
wget https://storage.googleapis.com/gcp-public-data--gnomad/legacy/exac_browser/forweb_cleaned_exac_r03_march16_z_data_pLI_CNV-final.txt.gz
gunzip forweb_cleaned_exac_r03_march16_z_data_pLI_CNV-final.txt.gz
mv forweb_cleaned_exac_r03_march16_z_data_pLI_CNV-final.txt forweb_cleaned_exac_r03_march16_z_data_pLI_CNV-final.tsv
```
We take the intersection, keeping only the genes with pLI > 0.5
```nu
let omim = open OMIM-new-genes.tsv
let pli = open forweb_cleaned_exac_r03_march16_z_data_pLI_CNV-final.tsv | select gene pLI
$omim | join $pli Gene gene | first
$omim | join $pli Gene gene | where pLI > 0.5
```
We get 49 genes but not necessarily with an associated clinical picture... We rather follow Alexis's method (cf. Clinical synopsis)
DECIPHER? pLI values differ from gnomAD because they depend on the transcript
## Clinical synopsis
We look for all synopses created after January 1st, 2021. These are not necessarily new genes nor new phenotypes
https://omim.org/search?index=entry&sort=score desc, prefix_sort desc&start=1&limit=200&cs_exists=true&date_created_from=2021/01/01&format=tsv
We use a Rust script to merge all the pipeline TSVs into a single one (see code/reanalyse/README.md) with the corresponding genes:
```bash
cargo run --release --bin candidates-phenotypes
```
The run_filtered.tsv file contains the intersection. We get 172 OMIM entities and about 3100 corresponding variants.
We filter by gnomAD exome AF, in increasing order.
Candidates under a dominant model:
- SPEN:
  - NM_015001.3:c.10709A>G VUS 62982227 with negative exome
  - half of the bioinformatics scores in favor, not found in gnomAD
  - region relatively intolerant to missense (DECIPHER), at the end of the gene, no hotspot
  - clinical: fetus with a cardiac malformation, partial fit
  - 1 ClinVar VUS
- LMNB2:
  - NM_032737.4:c.1634C>T VUS-? a few bioinformatics scores, no hotspot, clinical picture partially fits on the intellectual disability, negative exome
We follow Alexis's advice: filter if < 1% for dominant, 5% for recessive.
Pause: we look at truncating variants, grouping by Cento ID to avoid processing the same patient several times.
No interesting truncating variant in SPEN.
Problems (see the cleanup sketch after this list):
- Some gnomAD exome AF fields have several comma-separated values
- same for CADD PHRED
- same for SpliceAI AL, AG, DL, DG
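One possible cleanup before sorting on these columns (sketch; the column index 7 is a placeholder for the gnomAD exome AF column, adjust it to the actual TSV layout):
```bash
# Keep only the maximum of the comma-separated values in column 7 of run_filtered.tsv.
awk -F'\t' 'BEGIN{OFS="\t"} {
  n = split($7, v, ",")
  max = v[1]
  for (i = 2; i <= n; i++) if (v[i] + 0 > max + 0) max = v[i]
  $7 = max
  print
}' run_filtered.tsv > run_filtered_clean.tsv
```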
# Demographics
## Clinical data
### Extraction
Extraction of the clinical data in HPO format from the reports exported as text
```checkpipeline -i data -o clinical.csv```
### Cleanup
1. Manual cleanup, splitting the lines containing commas (+ misc.)
2. Inspect the data to fix errors with spaces, etc.:
`open clinical.csv | get Clinical | str downcase | uniq -c | sort-by count`
3. We use a Rust script to compare the terms against the HPO database. The database must be downloaded locally and the clinical data must be in ../data/clinical_nodup.csv.
For the HPO database, go to ~/code/hpo; the following must have been downloaded:
```
data/
├── hp.obo
├── phenotype.hpoa
└── phenotype_to_genes.txt
```
```bash
cargo run --release --example obo_to_bin data ontology.hpo
cp ontology.hpo ~/research/bisonex/code/hpo
cd ~/research/bisonex/code/hpo
cargo run --release --bin cleanup
```
The output file is `clinical_corrected.csv`, with a new column containing the corrected annotation (`Clinical_corr`).
We then manually review the terms that were not matched
```nu
open clinical_corrected.csv | where ($it.Clinical_corr | is-empty) | get Clinical | sort | uniq
```
### Categories
From ../data/clinical_corrected.csv, we look at the parents in the HPO hierarchy that are at a distance of 3 from the root; the result is written to clinical_main.csv
```bash
cargo run --release --bin clinical_main
```
Then we look at the most frequent terms and the categories
```nu
open clinical_corrected.csv | get Clinical_corr | uniq -c | sort-by count | to csv
open clinical_main.csv | select major_parent len | sort-by len | to csv
```
### Graph
Generate a Graphviz file (test.dot) from clinical_main.csv
```bash
cargo run --release --bin graph
dot -Tsvg test.dot > test.svg
```
### Miscellaneous
NB: the dates were found with
`rg "Report date: .*" data/* -o -I | lines | each {|e| $e | str replace "Report date: " "" } | save dates.txt`
and by checking the extreme dates by hand
Beware, some patients appear twice:
```julia
using DataFramesMeta, CSV
d = CSV.read("clinical.csv", DataFrame)
d2 = @by d :Patient :ER :mysum = length(:ER)
ids = findall(nonunique(@select d2 :Patient))
d2[ids, :]
# 52 Duplicate patients
nrow(unique(d2[ids, :]))
# 911 patients
nrow(unique(@select d :Patient))
```
If we want to remove the duplicates, we have them directly. But one of each pair is removed at random...
```julia
CSV.write("todelete.csv", d2[ids, [:ER]])
```
Remove "ER " by hand from this file, then
```
rg -v -f todelete.csv clinical.csv | save clinical_nodup.csv
```
### Results
Most frequent HPO terms (>= 40)
```nu
open ../data/clinical_nodup.csv | get Clinical | uniq -c | sort-by count | where count >= 40 | to csv
```
## Exome results
1178 exomes, of which 586 are negative
```nu
open ~/annex/data/centogene/variants/variants_centogene_final.csv | select "patient ID" "genomic (hg38)" | uniq | where "genomic (hg38)" == "negatif" | length
open ~/annex/data/centogene/variants/variants_centogene_final.csv | select "patient ID" "genomic (hg38)" | uniq | length # 1178
```
1056 unique patients with an exome
```nu
open ~/annex/data/centogene/variants/variants_centogene_final.csv | get "patient ID" | uniq | length
```
247 raw data sets
```nu
ls ~/annex/data/bisonex/call_variant/haplotypecaller/2* | length
```
562 unique variants:
```nu
open ~/annex/data/centogene/variants/variants_centogene_final.csv | where coding != negatif | select "transcript (cento)" coding | uniq | length
```
Variant classification (note: 4 variants too many, remove 1 everywhere): TODO
Variant types
```nu
❯ open ~/annex/data/centogene/variants/variants_centogene_final.csv | get "genomic (hg38)" | uniq -c | where value =~ ">" | length
❯ open ~/annex/data/centogene/variants/variants_centogene_final.csv | get "genomic (hg38)" | uniq -c | where value =~ "del" | length
❯ open ~/annex/data/centogene/variants/variants_centogene_final.csv | get "genomic (hg38)" | uniq -c | where value =~ "dup" | length
❯ open ~/annex/data/centogene/variants/variants_centogene_final.csv | get "genomic (hg38)" | uniq -c | where value =~ "ins" | length
```
Zygosity
```nu
❯ open ~/annex/data/centogene/variants/variants_centogene_final.csv | select "genomic (hg38)" zygosity | uniq | get zygosity | uniq -c | to csv
```
## Versions
`nix flake show`
```
git+file:///home/alex/code/bisonex
└───packages
└───x86_64-linux
├───awscli2: package 'awscli2-2.11.20'
├───bcftools: package 'bcftools-1.17'
├───bedtools: package 'bedtools-2.31.0'
├───bwa: package 'bwa-unstable-2022-09-23'
├───default: package 'nextflow-22.10.6'
├───dos2unix: package 'dos2unix-7.4.4'
├───fastqc: package 'fastqc'
├───gatk: package 'gatk-4.4.0.0'
├───hap-py: package 'hap.py'
├───htslib: package 'htslib-1.17'
├───mosdepth: package 'mosdepth-0.3.3'
├───multiqc: package 'multiqc-1.15'
├───picard-tools: package 'picard-tools-3.0.0'
├───python: package 'python3-3.10.12-env'
├───r: package 'R-4.2.3-wrapper'
├───rtg-tools: package 'rtg-tools-3.12.1'
├───samtools: package 'samtools-1.17'
├───spip: package 'spip'
├───sratoolkit: package 'vcftools-0.1.16'
├───vcftools: package 'vcftools-0.1.16'
└───vep: package 'perl5.36.0-ensembl-vep-110'
```
# Comparison with Cento
We use `code/reanalysis/compare-cento.jl`, which extracts the (non-negative) variants from `~/annex/data/centogene/variants/variants_centogene_final.csv` and looks them up in the TSVs corresponding to the pipeline output in `~/annex/data/bisonex/annotate/full/`
```nu
cd code/reanalysis
julia --project=.. compare-to-cento.jl | str join | save compare-to-cento.txt -f
```
153 negatives
```nu
rg "not in list" compare-to-cento.txt | rg "^6" | wc -l
```
94 found
```nu
rg -v "not in list" compare-to-cento.txt | rg Found | wc -l
```
Variant types: 7 del, 4 dup, 1 ins and the rest SNVs
```nu
❯ rg found compare-to-cento.txt -i | rg "del" | wc -l
❯ rg found compare-to-cento.txt -i | rg "dup" | wc -l
❯ rg found compare-to-cento.txt -i | rg "ins" | wc -l
```
4 failures
```nu
❯ rg -v "not in list" compare-to-cento.txt | rg FAILED
FAILED to find chr17:g.7884996C>T CHD3
FAILED to find chr10:g.102230760del PITX3
FAILED to find chr15:g.26869324A>T GABRA5
FAILED to find chr11:g.14358800C>A RRAS2
```
"Au total, 4 variants non retrouvés sur 121 positifs, tous VOUS"
NB: Mail 1: erreur : NM_001372044.2:c.3727dup est bien retrouvé alors que dit initialement que non.
- CHD3 : "une horreur à désigner"
- PITX3 : "riche en GC avec un homopolymère de 7G au niveau de la del"
- GABRA5 : "pas particulièremnt difficile"
Pourquoi sont-ils filtrés ? Notre seuil : 30 reads et 10 porteurs
- CHD3: pas assez de reads (22) : `GT:AD:DP:GQ:PL 0/1:5,22:27:43:507,0,43`
- PITX3 : pas assez de reads avec la variation (8) : attention à la représentation car homopolymère, il est en fait en 102230753 `GT:AD:DP:GQ:PL 0/1:26,8:34:99:146,0,671`
- GABRA5 : pas assez de reads (15) et pas assez avec variation (6) `GT:AD:DP:GQ:PL 0/1:9,6:15:99:103,0,213`
- RRAS2: pas assez de reads mais un seuil à 29 suffirait ! ` GT:AD:DP:GQ:PL 0/1:15,14:29:99:310,0,331`
## Depth
We extract the positions and IDs with ~/research/code/bisonex/reanalysis/depth.nu
The BED-format positions are converted to hg19 with liftOver.
Then we query the VCFs to get the depth and the variant-supporting reads with
` bcftools query -f '%CHROM %POS %REF %ALT %DP [ %AD ]\n' XXX.fboth.pass.vcf.gz chr17:7788314-7788314`
| CHROM | POS | REF | ALT | DEPTH | AD |
| --- | --- | --- | --- | --- | --- |
| chr10 | 103990510 | CG | C | 45 | 24,8 |
| chr17 | 7788314 | C | T | 29 | 5,21 |
| chr15 | 27114471 | A | T | 78 | 34,42 |
| chr11 | 14380346 | C | A | 29 | 15,14 |
## Bottom line
- GABRA5 was due to an incomplete FASTQ: it is indeed found in the output.
- The 3 other variants should be recoverable by slightly relaxing the filters (see the sketch below)
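The pipeline's actual filtering step is not reproduced here; as a rough illustration (file names and exact thresholds are hypothetical), a slightly relaxed hard filter with bcftools could look like this, down from the current 30 reads / 10 variant-carrying reads:
```bash
# Keep variants with at least 25 reads and 8 reads supporting the first ALT allele.
bcftools view -i 'FORMAT/DP[0] >= 25 && FORMAT/AD[0:1] >= 8' \
  input.haplotypecaller.vcf.gz -Oz -o output.relaxed.vcf.gz
```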
### pLI
1. OTUD5: no
2. FGF13: the clinical picture only half fits, already reviewed with Paul
3. TFE3: 3 ClinVar benign/likely benign variants
### ClinVar pathogenic
WARNING: we forgot to look at the intersection, so far too many variants...
Truncating variants and dominant OMIM entries
- ACAD11: chr3:g.132659715C>A in 63037731, not described in human disease
  - negative exome with 2 "potentially relevant findings": CNOT3 and YY1
  - in favor: truncating variant reported once in ClinVar and in a region escaping NMD (DECIPHER?), SPIP 45%
  - against: pLI = 0, normal SpliceAI, no OMIM entry, Franklin annotates it on UBA5 and calls it VUS-
- BTD: ClinVar VUS, negative exome, pLI = 0, 63078133, does not match the clinical picture
Missense
- ACAD11: meh, a missense with prediction NM_032169.5(ACAD11):c.1129C>T, bioinformatics scores in favor but no hotspot, no clinical fit
## New clinical synopses, cleaned
# Performance
Statistics for the whole pipeline:
- show the timeline report (built from the trace report)
- look at the execution report https://www.nextflow.io/docs/latest/tracing.html#execution-report
## BWA mem
The simplest approach is to use hyperfine to launch successive runs while varying the number of threads.
Testing each configuration several times would take too long. Test with 3 sequential runs (50 GB of memory):
```
Time (mean ± σ): 13725.110 s ± 42.090 s [User: 13870.522 s, System: 12.543 s]
Range (min … max): 13677.219 s … 13756.226 s 3 runs
```
So a single run per configuration, requesting 32 cores and 50 GB (in production: 24 cores and 50 GB), going through the thread counts in decreasing order.
```slurm
#!/bin/bash -l
# Fichier submission.SBATCH
#SBATCH --job-name="speedup-bwa"
#SBATCH --output=%x.%J.out ## %x=job name, %J=job id
#SBATCH --error=%x.%J.out
# walltime (hh:mm::ss) max is 8 days
#SBATCH -t 24:00:00
#SBATCH --partition=smp
#SBATCH -c 32 ## request 32 cores (MAX is 32)
#SBATCH --mem=50G ## (MAX is 96G)
#SBATCH --mail-user=apraga@chu-besancon.fr
#SBATCH --mail-type=END,FAIL # notify when job end/fail
module load nix/2.11.0
# Requires a working directory by nextflow
cd /Work/Users/apraga/bisonex/work/8c/cf49fd4508404faa6986ca4b211e49
INDEX=`find -L ./ -name "*.amb" | sed 's/\.amb$//'`
# 1 run for each numbers of threads
# JSON output is needed to have the stats for each run
# Don't do all configuration and starts from the more expensive
hyperfine --export-json hyperfine.json -L threads 32,24,16,8,4,2,1 --runs 1 "bwa mem -R '@RG\tID:sample\tSM:sample\tPL:ILLUMINA\tPM:Miseq\tCN:CHU_Minjoz\tLB:definition_to_add' -t {threads} $INDEX k12\:15b90d6813adf03787af0b2b90d708f1.fastq.gz k12\:9c41c3effaeead78e1becee8a49cb0ea.fastq.gz"
```
Note: the JSON output format is required to get the time of each run rather than only the aggregate statistics.
Note: remember to adjust the number of threads!
Result in research/bisonex/code/plot/speedup-bwa.json
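To pull the per-run wall-clock time and the thread count back out of that JSON (sketch; `results[].parameters.threads` and `results[].times` are standard hyperfine output fields):
```bash
# One line per run: thread count, then wall-clock time in seconds.
jq -r '.results[] | [.parameters.threads, .times[0]] | @tsv' speedup-bwa.json
```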
```
Benchmark 1: bwa mem -R '@RG\tID:sample\tSM:sample\tPL:ILLUMINA\tPM:Miseq\tCN:CHU_Minjoz\tLB:definition_to_add' -t 32 ./bwa/GCA_000001405.15_GRCh38_full_analysis_set.fna k12\:15b90d6813adf03787af0b2b90d708f1.fastq.gz k12\:9c41c3effaeead78e1becee8a49cb0ea.fastq.gz
Time (abs ≡): 684.408 s [User: 18007.678 s, System: 115.024 s]
Benchmark 2: bwa mem -R '@RG\tID:sample\tSM:sample\tPL:ILLUMINA\tPM:Miseq\tCN:CHU_Minjoz\tLB:definition_to_add' -t 24 ./bwa/GCA_000001405.15_GRCh38_full_analysis_set.fna k12\:15b90d6813adf03787af0b2b90d708f1.fastq.gz k12\:9c41c3effaeead78e1becee8a49cb0ea.fastq.gz
Time (abs ≡): 760.921 s [User: 17606.845 s, System: 102.416 s]
Benchmark 3: bwa mem -R '@RG\tID:sample\tSM:sample\tPL:ILLUMINA\tPM:Miseq\tCN:CHU_Minjoz\tLB:definition_to_add' -t 16 ./bwa/GCA_000001405.15_GRCh38_full_analysis_set.fna k12\:15b90d6813adf03787af0b2b90d708f1.fastq.gz k12\:9c41c3effaeead78e1becee8a49cb0ea.fastq.gz
Time (abs ≡): 1115.368 s [User: 17417.684 s, System: 115.360 s]
Benchmark 4: bwa mem -R '@RG\tID:sample\tSM:sample\tPL:ILLUMINA\tPM:Miseq\tCN:CHU_Minjoz\tLB:definition_to_add' -t 8 ./bwa/GCA_000001405.15_GRCh38_full_analysis_set.fna k12\:15b90d6813adf03787af0b2b90d708f1.fastq.gz k12\:9c41c3effaeead78e1becee8a49cb0ea.fastq.gz
Time (abs ≡): 2149.281 s [User: 17083.367 s, System: 91.516 s]
Benchmark 5: bwa mem -R '@RG\tID:sample\tSM:sample\tPL:ILLUMINA\tPM:Miseq\tCN:CHU_Minjoz\tLB:definition_to_add' -t 4 ./bwa/GCA_000001405.15_GRCh38_full_analysis_set.fna k12\:15b90d6813adf03787af0b2b90d708f1.fastq.gz k12\:9c41c3effaeead78e1becee8a49cb0ea.fastq.gz
Time (abs ≡): 4262.605 s [User: 16936.132 s, System: 177.830 s]
Benchmark 6: bwa mem -R '@RG\tID:sample\tSM:sample\tPL:ILLUMINA\tPM:Miseq\tCN:CHU_Minjoz\tLB:definition_to_add' -t 2 ./bwa/GCA_000001405.15_GRCh38_full_analysis_set.fna k12\:15b90d6813adf03787af0b2b90d708f1.fastq.gz k12\:9c41c3effaeead78e1becee8a49cb0ea.fastq.gz
Time (abs ≡): 8313.601 s [User: 16411.816 s, System: 332.537 s]
Benchmark 7: bwa mem -R '@RG\tID:sample\tSM:sample\tPL:ILLUMINA\tPM:Miseq\tCN:CHU_Minjoz\tLB:definition_to_add' -t 1 ./bwa/GCA_000001405.15_GRCh38_full_analysis_set.fna k12\:15b90d6813adf03787af0b2b90d708f1.fastq.gz k12\:9c41c3effaeead78e1becee8a49cb0ea.fastq.gz
Time (abs ≡): 15049.006 s [User: 15075.411 s, System: 134.121 s]
Summary
bwa mem -R '@RG\tID:sample\tSM:sample\tPL:ILLUMINA\tPM:Miseq\tCN:CHU_Minjoz\tLB:definition_to_add' -t 32 ./bwa/GCA_000001405.15_GRCh38_full_analysis_set.fna k12\:15b90d6813adf03787af0b2b90d708f1.fastq.gz k12\:9c41c3effaeead78e1becee8a49cb0ea.fastq.gz ran
1.11 times faster than bwa mem -R '@RG\tID:sample\tSM:sample\tPL:ILLUMINA\tPM:Miseq\tCN:CHU_Minjoz\tLB:definition_to_add' -t 24 ./bwa/GCA_000001405.15_GRCh38_full_analysis_set.fna k12\:15b90d6813adf03787af0b2b90d708f1.fastq.gz k12\:9c41c3effaeead78e1becee8a49cb0ea.fastq.gz
1.63 times faster than bwa mem -R '@RG\tID:sample\tSM:sample\tPL:ILLUMINA\tPM:Miseq\tCN:CHU_Minjoz\tLB:definition_to_add' -t 16 ./bwa/GCA_000001405.15_GRCh38_full_analysis_set.fna k12\:15b90d6813adf03787af0b2b90d708f1.fastq.gz k12\:9c41c3effaeead78e1becee8a49cb0ea.fastq.gz
3.14 times faster than bwa mem -R '@RG\tID:sample\tSM:sample\tPL:ILLUMINA\tPM:Miseq\tCN:CHU_Minjoz\tLB:definition_to_add' -t 8 ./bwa/GCA_000001405.15_GRCh38_full_analysis_set.fna k12\:15b90d6813adf03787af0b2b90d708f1.fastq.gz k12\:9c41c3effaeead78e1becee8a49cb0ea.fastq.gz
6.23 times faster than bwa mem -R '@RG\tID:sample\tSM:sample\tPL:ILLUMINA\tPM:Miseq\tCN:CHU_Minjoz\tLB:definition_to_add' -t 4 ./bwa/GCA_000001405.15_GRCh38_full_analysis_set.fna k12\:15b90d6813adf03787af0b2b90d708f1.fastq.gz k12\:9c41c3effaeead78e1becee8a49cb0ea.fastq.gz
12.15 times faster than bwa mem -R '@RG\tID:sample\tSM:sample\tPL:ILLUMINA\tPM:Miseq\tCN:CHU_Minjoz\tLB:definition_to_add' -t 2 ./bwa/GCA_000001405.15_GRCh38_full_analysis_set.fna k12\:15b90d6813adf03787af0b2b90d708f1.fastq.gz k12\:9c41c3effaeead78e1becee8a49cb0ea.fastq.gz
21.99 times faster than bwa mem -R '@RG\tID:sample\tSM:sample\tPL:ILLUMINA\tPM:Miseq\tCN:CHU_Minjoz\tLB:definition_to_add' -t 1 ./bwa/GCA_000001405.15_GRCh38_full_analysis_set.fna k12\:15b90d6813adf03787af0b2b90d708f1.fastq.gz k12\:9c41c3effaeead78e1becee8a49cb0ea.fastq.gz
```
## Haplotypecaller
With HaplotypeCaller there is no longer a way to specify the number of threads in version 4... Is it really parallelized?
PairHMM at least is parallelized with OpenMP. We test with:
```slurm
#!/bin/bash -l
# Fichier submission.SBATCH
#SBATCH --job-name="speedup-haplo"
#SBATCH --output=%x.%J.out ## %x=job name, %J=job id
#SBATCH --error=%x.%J.out
# walltime (hh:mm::ss) max is 8 days
#SBATCH -t 24:00:00
#SBATCH --partition=smp
#SBATCH -c 16 ## request 16 cores (MAX is 32)
#SBATCH --mem=16G ## (MAX is 96G)
#SBATCH --mail-user=apraga@chu-besancon.fr
#SBATCH --mail-type=END,FAIL # notify when job end/fail
module load nix/2.11.0
cd /Work/Users/apraga/bisonex/work/d4/5b6f4963533a6319c3c67d4731e511
hyperfine --export-json hyperfine-haplo -L threads 12,6,4,2,1 --runs 1 "gatk --java-options \"-Xmx13107M\" HaplotypeCaller --input HG001-HiSeq4000-Agilentv7-GRCh38.recal.bam --output HG001-HiSeq4000-Agilentv7-GRCh38.haplotypecaller.vcf.gz --reference GCA_000001405.15_GRCh38_full_analysis_set.fna --dbsnp GCF_000001405.40.gz --tmp-dir . --max-mnp-distance 2"
```
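Note that the command above never references `{threads}`, so every run used GATK's default thread settings, which would explain the nearly identical times below. If the goal is to vary the OpenMP PairHMM threads, the parameter has to appear in the command, e.g. via `--native-pair-hmm-threads` (sketch):
```bash
hyperfine --export-json hyperfine-haplo.json -L threads 12,6,4,2,1 --runs 1 \
  "gatk --java-options '-Xmx13107M' HaplotypeCaller \
     --native-pair-hmm-threads {threads} \
     --input HG001-HiSeq4000-Agilentv7-GRCh38.recal.bam \
     --output HG001-HiSeq4000-Agilentv7-GRCh38.haplotypecaller.vcf.gz \
     --reference GCA_000001405.15_GRCh38_full_analysis_set.fna \
     --dbsnp GCF_000001405.40.gz --tmp-dir . --max-mnp-distance 2"
```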
Result in research/bisonex/code/plot/speedup-haplo.json
```
Benchmark 1: gatk --java-options "-Xmx13107M" HaplotypeCaller --input HG001-HiSeq4000-Agilentv7-GRCh38.recal.bam --output HG001-HiSeq4000-Agilentv7-GRCh38.haplotypecaller.vcf.gz --reference GCA_000001405.15_GRCh38_full_analysis_set.fna --dbsnp GCF_000001405.40.gz --tmp-dir . --max-mnp-distance 2 (threads = 12)
Time (abs ≡): 14428.832 s [User: 14399.964 s, System: 213.747 s]
Benchmark 2: gatk --java-options "-Xmx13107M" HaplotypeCaller --input HG001-HiSeq4000-Agilentv7-GRCh38.recal.bam --output HG001-HiSeq4000-Agilentv7-GRCh38.haplotypecaller.vcf.gz --reference GCA_000001405.15_GRCh38_full_analysis_set.fna --dbsnp GCF_000001405.40.gz --tmp-dir . --max-mnp-distance 2 (threads = 6)
Time (abs ≡): 14164.462 s [User: 14128.192 s, System: 198.870 s]
Benchmark 3: gatk --java-options "-Xmx13107M" HaplotypeCaller --input HG001-HiSeq4000-Agilentv7-GRCh38.recal.bam --output HG001-HiSeq4000-Agilentv7-GRCh38.haplotypecaller.vcf.gz --reference GCA_000001405.15_GRCh38_full_analysis_set.fna --dbsnp GCF_000001405.40.gz --tmp-dir . --max-mnp-distance 2 (threads = 4)
Time (abs ≡): 14215.604 s [User: 14214.320 s, System: 216.795 s]
Benchmark 4: gatk --java-options "-Xmx13107M" HaplotypeCaller --input HG001-HiSeq4000-Agilentv7-GRCh38.recal.bam --output HG001-HiSeq4000-Agilentv7-GRCh38.haplotypecaller.vcf.gz --reference GCA_000001405.15_GRCh38_full_analysis_set.fna --dbsnp GCF_000001405.40.gz --tmp-dir . --max-mnp-distance 2 (threads = 2)
Time (abs ≡): 14308.381 s [User: 14294.416 s, System: 213.043 s]
Benchmark 5: gatk --java-options "-Xmx13107M" HaplotypeCaller --input HG001-HiSeq4000-Agilentv7-GRCh38.recal.bam --output HG001-HiSeq4000-Agilentv7-GRCh38.haplotypecaller.vcf.gz --reference GCA_000001405.15_GRCh38_full_analysis_set.fna --dbsnp GCF_000001405.40.gz --tmp-dir . --max-mnp-distance 2 (threads = 1)
Time (abs ≡): 14136.094 s [User: 14064.118 s, System: 175.672 s]
Summary
gatk --java-options "-Xmx13107M" HaplotypeCaller --input HG001-HiSeq4000-Agilentv7-GRCh38.recal.bam --output HG001-HiSeq4000-Agilentv7-GRCh38.haplotypecaller.vcf.gz --reference GCA_000001405.15_GRCh38_full_analysis_set.fna --dbsnp GCF_000001405.40.gz --tmp-dir . --max-mnp-distance 2 (threads = 1) ran
1.00 times faster than gatk --java-options "-Xmx13107M" HaplotypeCaller --input HG001-HiSeq4000-Agilentv7-GRCh38.recal.bam --output HG001-HiSeq4000-Agilentv7-GRCh38.haplotypecaller.vcf.gz --reference GCA_000001405.15_GRCh38_full_analysis_set.fna --dbsnp GCF_000001405.40.gz --tmp-dir . --max-mnp-distance 2 (threads = 6)
1.01 times faster than gatk --java-options "-Xmx13107M" HaplotypeCaller --input HG001-HiSeq4000-Agilentv7-GRCh38.recal.bam --output HG001-HiSeq4000-Agilentv7-GRCh38.haplotypecaller.vcf.gz --reference GCA_000001405.15_GRCh38_full_analysis_set.fna --dbsnp GCF_000001405.40.gz --tmp-dir . --max-mnp-distance 2 (threads = 4)
1.01 times faster than gatk --java-options "-Xmx13107M" HaplotypeCaller --input HG001-HiSeq4000-Agilentv7-GRCh38.recal.bam --output HG001-HiSeq4000-Agilentv7-GRCh38.haplotypecaller.vcf.gz --reference GCA_000001405.15_GRCh38_full_analysis_set.fna --dbsnp GCF_000001405.40.gz --tmp-dir . --max-mnp-distance 2 (threads = 2)
1.02 times faster than gatk --java-options "-Xmx13107M" HaplotypeCaller --input HG001-HiSeq4000-Agilentv7-GRCh38.recal.bam --output HG001-HiSeq4000-Agilentv7-GRCh38.haplotypecaller.vcf.gz --reference GCA_000001405.15_GRCh38_full_analysis_set.fna --dbsnp GCF_000001405.40.gz --tmp-dir . --max-mnp-distance 2 (threads = 12)
```
# Reproducibility
We use nix 23.05 in a flake, so the versions do not change.
List of versions given by:
```bash
nix profile list | awk '{print $4}' | awk -F '-' '{print "|" $2" | "$3}'
```
| Software | Validated version |
| --- | --- |
| R | 4.2.3 |
| bwa | unstable |
| rtg-tools | ? |
| gatk | 4.4.0.0 |
| spip | |
| awscli2 | 2.11.20 |
| fastqc | |
| hap.py | |
| htslib | 1.17 |
| multiqc | 1.15 |
| picard | ? |
| python3 | 3.10.12 |
| zoxide | 0.9.2 |
| bcftools | 1.17 |
| bedtools | 2.31.0 |
| dos2unix | 7.4.4 |
| ensembl | perl5.36.0 + ? |
| mosdepth | 0.3.3 |
| nextflow | 22.10.6 |
| samtools | 1.17 |
| spliceai | 1.3.1 |
| vcftools | 0.1.16 |
| ensembl | perl5.36.0 |
| sratoolkit | 2.11.3 |
[Performances](performances.md)
[Validation](validation.md)
[Tests in silico](insilico.md)
# Guix vs nix
Guix is more widely used in HPC (in France). Nix more popular?
# Workflow management tools
Nextflow, Snakemake, CWL = the biggest ones.
But there are other attempts:
- Guix: guixwl, but it failed -> ccwl
- Haskell: funflow, ngless
- OCaml: bistro
Beware, jobs are cut off at midnight, so nextflow must be launched from a slurm job.
Example:
```slurm
#!/bin/bash -l
# Fichier submission.SBATCH
#SBATCH --job-name="bisonex-varben"
#SBATCH --output=%x.%J.out ## %x=job name, %J=job id
#SBATCH --error=%x.%J.out
# walltime (hh:mm::ss) max is 8 days
#SBATCH -t 24:00:00
#SBATCH --partition=smp
#SBATCH -c 1 ## request 1 core (MAX is 32)
#SBATCH --mem=12G ## (MAX is 96G)
#SBATCH --mail-user=apraga@chu-besancon.fr
#SBATCH --mail-type=END,FAIL # notify when job end/fail
module load nix/2.11.0
# Otherwise job fails as it cannot write to $HOME/.nextflow
export NXF_HOME=/Work/Users/apraga/.nextflow
# user.name must be forced (again... our fix does not seem to work in nix)
nextflow -Duser.name=apraga run main.nf -profile standard,helios --input=sanger-varben.csv --genome=GRCh38 -resume 8749b270-ca2c-4f72-82d9-69fff6563c56
```
```toml
[book]
authors = ["Alexis Praga"]
language = "en"
multilingual = false
src = "."
title = "Notes bisonex"
```
To re-implement xamscissors in Rust, we are missing the reference positions each read aligns to when using the noodles library. We would need to re-implement pysam's `get_aligned` in noodles
https://github.com/pysam-developers/pysam/blob/cdc0ed12fbe2d7633b8fa47534ab2c2547f66b84/pysam/libcalignedsegment.pyx
Dependencies to install with nix
- Aligner
  - No: BWA-mem, bwa-mem2 (already available), Sentieon (commercial)
  - TODO: Dragmap
# Summary
- [Performance](performances.md)
- [Validation](validation/summary.md)
  - [Configuration](validation/configuration.md)
  - [Real data](validation/real/summary.md)
    - [Inspiration](validation/real/inspiration.md)
    - [GIAB](validation/real/giab.md)
  - [In silico](validation/insilico/summary.md)
    - [Varben](validation/insilico/varben.md)
    - [Simuscop](validation/insilico/simuscop.md)
- [Reanalysis](reanalyse/summary.md)
  - [Demographics](reanalyse/demographie.md)
  - [Comparison with Cento](reanalyse/compare-cento.md)
  - [Results](reanalyse/resultat.md)
- [Miscellaneous](divers.md)